`. |
+| **List saved profiles** | `crwl profiles` → choose **1** | Shows name, browser type, size, last-modified. |
+| **Delete a profile** | `crwl profiles` → choose **3** → pick the profile index → confirm | Removes the folder. |
+| **Crawl with a profile (default alias)** | `crwl https://site.com/dashboard -p my-profile` | Keeps login cookies, sets `use_managed_browser=true` under the hood. |
+| **Crawl + verbose JSON output** | `crwl https://site.com -p my-profile -o json -v` | Any other `crawl` flags work the same. |
+| **Crawl with extra browser tweaks** | `crwl https://site.com -p my-profile -b "headless=true,viewport_width=1680"` | CLI overrides go on top of the profile. |
+| **Same but via explicit sub-command** | `crwl crawl https://site.com -p my-profile` | Identical to default alias. |
+| **Use profile from inside Profile Manager** | `crwl profiles` → choose **4** → pick profile → enter URL → follow prompts | Handy when demo-ing to non-CLI folks. |
+| **One-off crawl with a profile folder path (no name lookup)** | `crwl https://site.com -b "user_data_dir=$HOME/.crawl4ai/profiles/my-profile,use_managed_browser=true"` | Bypasses registry, useful for CI scripts. |
+| **Launch a dev browser on CDP port with the same identity** | `crwl cdp -d $HOME/.crawl4ai/profiles/my-profile -P 9223` | Lets Puppeteer/Playwright attach for debugging. |
+
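+Once the dev browser from the last row is listening on its CDP port, any CDP-capable client can attach to the same identity. Below is a minimal, illustrative sketch using Playwright's Python API (assumptions: Playwright is installed, `9223` matches the `-P` port you chose, and the target URL is a placeholder):
+
+```python
+from playwright.sync_api import sync_playwright
+
+# Attach to the browser started by `crwl cdp -d ... -P 9223`.
+with sync_playwright() as p:
+    browser = p.chromium.connect_over_cdp("http://localhost:9223")
+    # Reuse the existing context (it carries the profile's cookies), or create one.
+    context = browser.contexts[0] if browser.contexts else browser.new_context()
+    page = context.new_page()
+    page.goto("https://example.com")
+    print(page.title())
+```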
diff --git a/docs/examples/README_BUILTIN_BROWSER.md b/docs/examples/README_BUILTIN_BROWSER.md
new file mode 100644
index 00000000..35ade639
--- /dev/null
+++ b/docs/examples/README_BUILTIN_BROWSER.md
@@ -0,0 +1,123 @@
+# Builtin Browser in Crawl4AI
+
+This document explains the builtin browser feature in Crawl4AI and how to use it effectively.
+
+## What is the Builtin Browser?
+
+The builtin browser is a persistent Chrome instance that Crawl4AI manages for you. It runs in the background and can be used by multiple crawling operations, eliminating the need to start and stop browsers for each crawl.
+
+Benefits include:
+- **Faster startup times** - The browser is already running, so your scripts start faster
+- **Shared resources** - All your crawling scripts can use the same browser instance
+- **Simplified management** - No need to worry about CDP URLs or browser processes
+- **Persistent cookies and sessions** - Browser state persists between script runs
+- **Lower resource usage** - Only one browser instance serves multiple scripts
+
+## Using the Builtin Browser
+
+### In Python Code
+
+Using the builtin browser in your code is simple:
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+# Create browser config with builtin mode
+browser_config = BrowserConfig(
+ browser_mode="builtin", # This is the key setting!
+ headless=True # Can be headless or not
+)
+
+# Create the crawler
+crawler = AsyncWebCrawler(config=browser_config)
+
+# Use it - no need to explicitly start()
+result = await crawler.arun("https://example.com")
+```
+
+Key points:
+1. Set `browser_mode="builtin"` in your BrowserConfig
+2. No need for explicit `start()` call - the crawler will automatically connect to the builtin browser
+3. No need to use a context manager or call `close()` - the browser stays running
+
+### Via CLI
+
+The CLI provides commands to manage the builtin browser:
+
+```bash
+# Start the builtin browser
+crwl browser start
+
+# Check its status
+crwl browser status
+
+# Open a visible window to see what the browser is doing
+crwl browser view --url https://example.com
+
+# Stop it when no longer needed
+crwl browser stop
+
+# Restart with different settings
+crwl browser restart --no-headless
+```
+
+When crawling via CLI, simply add the builtin browser mode:
+
+```bash
+crwl https://example.com -b "browser_mode=builtin"
+```
+
+## How It Works
+
+1. When a crawler with `browser_mode="builtin"` is created:
+ - It checks if a builtin browser is already running
+ - If not, it automatically launches one
+ - It connects to the browser via CDP (Chrome DevTools Protocol)
+
+2. The browser process continues running after your script exits
+ - This means it's ready for the next crawl
+ - You can manage it via the CLI commands
+
+3. During installation, Crawl4AI attempts to create a builtin browser automatically
+
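+In practice this means separate crawler instances, even across separate script runs, all attach to the same browser. The snippet below is a minimal sketch of that reuse, built only from the API shown above; the URLs are placeholders:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+async def main():
+    config = BrowserConfig(browser_mode="builtin", headless=True)
+
+    # First crawler: launches the builtin browser if it is not already running.
+    first = AsyncWebCrawler(config=config)
+    result = await first.arun("https://example.com")
+    print("first run ok:", result.success)
+
+    # Second crawler: attaches to the same browser over CDP, so startup is faster.
+    second = AsyncWebCrawler(config=config)
+    result = await second.arun("https://example.com")
+    print("second run ok:", result.success)
+
+asyncio.run(main())
+```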
+## Example
+
+See the [builtin_browser_example.py](builtin_browser_example.py) file for a complete example.
+
+Run it with:
+
+```bash
+python builtin_browser_example.py
+```
+
+## When to Use
+
+The builtin browser is ideal for:
+- Scripts that run frequently
+- Development and testing workflows
+- Applications that need to minimize startup time
+- Systems where you want to manage browser instances centrally
+
+You might not want to use it when:
+- Running one-off scripts
+- When you need different browser configurations for different tasks
+- In environments where persistent processes are not allowed
+
+## Troubleshooting
+
+If you encounter issues:
+
+1. Check the browser status:
+   ```bash
+ crwl browser status
+ ```
+
+2. Try restarting it:
+   ```bash
+ crwl browser restart
+ ```
+
+3. If problems persist, stop it and let Crawl4AI start a fresh one:
+   ```bash
+ crwl browser stop
+ ```
\ No newline at end of file
diff --git a/docs/examples/arun_vs_arun_many.py b/docs/examples/arun_vs_arun_many.py
new file mode 100644
index 00000000..40bc4381
--- /dev/null
+++ b/docs/examples/arun_vs_arun_many.py
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
+
+VERBOSE = False
+
+async def crawl_sequential(urls):
+ config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+ results = []
+ start_time = time.perf_counter()
+ async with AsyncWebCrawler() as crawler:
+ for url in urls:
+ result_container = await crawler.arun(url=url, config=config)
+ results.append(result_container[0])
+ total_time = time.perf_counter() - start_time
+ return total_time, results
+
+async def crawl_parallel_dispatcher(urls):
+ config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+ # Dispatcher with rate limiter enabled (default behavior)
+ dispatcher = MemoryAdaptiveDispatcher(
+ rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
+ max_session_permit=50,
+ )
+ start_time = time.perf_counter()
+ async with AsyncWebCrawler() as crawler:
+ result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+ results = []
+ if isinstance(result_container, list):
+ results = result_container
+ else:
+ async for res in result_container:
+ results.append(res)
+ total_time = time.perf_counter() - start_time
+ return total_time, results
+
+async def crawl_parallel_no_rate_limit(urls):
+ config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+ # Dispatcher with no rate limiter and a high session permit to avoid queuing
+ dispatcher = MemoryAdaptiveDispatcher(
+ rate_limiter=None,
+ max_session_permit=len(urls) # allow all URLs concurrently
+ )
+ start_time = time.perf_counter()
+ async with AsyncWebCrawler() as crawler:
+ result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+ results = []
+ if isinstance(result_container, list):
+ results = result_container
+ else:
+ async for res in result_container:
+ results.append(res)
+ total_time = time.perf_counter() - start_time
+ return total_time, results
+
+async def main():
+ urls = ["https://example.com"] * 100
+ print(f"Crawling {len(urls)} URLs sequentially...")
+ seq_time, seq_results = await crawl_sequential(urls)
+ print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
+
+ print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
+ disp_time, disp_results = await crawl_parallel_dispatcher(urls)
+ print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
+
+ print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
+ no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
+ print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
+
+ print("Crawl4ai - Crawling Comparison")
+ print("--------------------------------------------------------")
+ print(f"Sequential crawling took: {seq_time:.2f} seconds")
+ print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
+ print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/browser_optimization_example.py b/docs/examples/browser_optimization_example.py
index 73637a71..cf5e1307 100644
--- a/docs/examples/browser_optimization_example.py
+++ b/docs/examples/browser_optimization_example.py
@@ -52,7 +52,7 @@ async def crawl_sequential(urls: List[str]):
)
if result.success:
print(f"Successfully crawled {url}")
- print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"Content length: {len(result.markdown.raw_markdown)}")
finally:
await crawler.close()
@@ -101,7 +101,7 @@ async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
print(f"Error crawling {url}: {str(result)}")
elif result.success:
print(f"Successfully crawled {url}")
- print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"Content length: {len(result.markdown.raw_markdown)}")
finally:
await crawler.close()
diff --git a/docs/examples/builtin_browser_example.py b/docs/examples/builtin_browser_example.py
new file mode 100644
index 00000000..0d551085
--- /dev/null
+++ b/docs/examples/builtin_browser_example.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+Builtin Browser Example
+
+This example demonstrates how to use Crawl4AI's builtin browser feature,
+which simplifies the browser management process. With builtin mode:
+
+- No need to manually start or connect to a browser
+- No need to manage CDP URLs or browser processes
+- Automatically connects to an existing browser or launches one if needed
+- Browser persists between script runs, reducing startup time
+- No explicit cleanup or close() calls needed
+
+The example also demonstrates "auto-starting" where you don't need to explicitly
+call start() method on the crawler.
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+import time
+
+async def crawl_with_builtin_browser():
+ """
+ Simple example of crawling with the builtin browser.
+
+ Key features:
+ 1. browser_mode="builtin" in BrowserConfig
+ 2. No explicit start() call needed
+ 3. No explicit close() needed
+ """
+ print("\n=== Crawl4AI Builtin Browser Example ===\n")
+
+ # Create a browser configuration with builtin mode
+ browser_config = BrowserConfig(
+ browser_mode="builtin", # This is the key setting!
+ headless=True # Can run headless for background operation
+ )
+
+ # Create crawler run configuration
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, # Skip cache for this demo
+ screenshot=True, # Take a screenshot
+ verbose=True # Show verbose logging
+ )
+
+ # Create the crawler instance
+ # Note: We don't need to use "async with" context manager
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ # Start crawling several URLs - no explicit start() needed!
+ # The crawler will automatically connect to the builtin browser
+ print("\n➡️ Crawling first URL...")
+ t0 = time.time()
+ result1 = await crawler.arun(
+ url="https://crawl4ai.com",
+ config=crawler_config
+ )
+ t1 = time.time()
+ print(f"✅ First URL crawled in {t1-t0:.2f} seconds")
+ print(f" Got {len(result1.markdown.raw_markdown)} characters of content")
+ print(f" Title: {result1.metadata.get('title', 'No title')}")
+
+ # Try another URL - the browser is already running, so this should be faster
+ print("\n➡️ Crawling second URL...")
+ t0 = time.time()
+ result2 = await crawler.arun(
+ url="https://example.com",
+ config=crawler_config
+ )
+ t1 = time.time()
+ print(f"✅ Second URL crawled in {t1-t0:.2f} seconds")
+ print(f" Got {len(result2.markdown.raw_markdown)} characters of content")
+ print(f" Title: {result2.metadata.get('title', 'No title')}")
+
+ # The builtin browser continues running in the background
+ # No need to explicitly close it
+ print("\n🔄 The builtin browser remains running for future use")
+ print(" You can use 'crwl browser status' to check its status")
+ print(" or 'crwl browser stop' to stop it when completely done")
+
+async def main():
+ """Run the example"""
+ await crawl_with_builtin_browser()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/cli/browser.yml b/docs/examples/cli/browser.yml
new file mode 100644
index 00000000..dd6caf60
--- /dev/null
+++ b/docs/examples/cli/browser.yml
@@ -0,0 +1,13 @@
+browser_type: "chromium"
+headless: true
+viewport_width: 1280
+viewport_height: 800
+user_agent_mode: "random"
+verbose: true
+text_mode: false
+light_mode: false
+ignore_https_errors: true
+java_script_enabled: true
+extra_args:
+ - "--disable-gpu"
+ - "--no-sandbox"
\ No newline at end of file
diff --git a/docs/examples/cli/crawler.yml b/docs/examples/cli/crawler.yml
new file mode 100644
index 00000000..61bd6670
--- /dev/null
+++ b/docs/examples/cli/crawler.yml
@@ -0,0 +1,13 @@
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+exclude_external_links: true
+exclude_social_media_links: true
\ No newline at end of file
diff --git a/docs/examples/cli/css_schema.json b/docs/examples/cli/css_schema.json
new file mode 100644
index 00000000..935efeb8
--- /dev/null
+++ b/docs/examples/cli/css_schema.json
@@ -0,0 +1,27 @@
+{
+ "name": "ArticleExtractor",
+ "baseSelector": ".cards[data-tax=news] .card__data",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.card__title",
+ "type": "text"
+ },
+ {
+ "name": "link",
+ "selector": "h4.card__title a",
+ "type": "attribute",
+ "attribute": "href"
+ },
+ {
+ "name": "details",
+ "selector": ".card__details",
+ "type": "text"
+ },
+ {
+ "name": "topics",
+ "selector": ".card__topics.topics",
+ "type": "text"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/docs/examples/cli/extract.yml b/docs/examples/cli/extract.yml
new file mode 100644
index 00000000..be22dd5e
--- /dev/null
+++ b/docs/examples/cli/extract.yml
@@ -0,0 +1,11 @@
+type: "llm"
+provider: "openai/gpt-4o-mini"
+api_token: "env:OPENAI_API_KEY"
+instruction: "Extract all articles with their titles, authors, publication dates and main topics in a structured format"
+params:
+ chunk_token_threshold: 4096
+ overlap_rate: 0.1
+ word_token_rate: 0.75
+ temperature: 0.3
+ max_tokens: 1000
+ verbose: true
\ No newline at end of file
diff --git a/docs/examples/cli/extract_css.yml b/docs/examples/cli/extract_css.yml
new file mode 100644
index 00000000..a4004a3e
--- /dev/null
+++ b/docs/examples/cli/extract_css.yml
@@ -0,0 +1,3 @@
+type: "json-css"
+params:
+ verbose: true
\ No newline at end of file
diff --git a/docs/examples/cli/llm_schema.json b/docs/examples/cli/llm_schema.json
new file mode 100644
index 00000000..a6969ccd
--- /dev/null
+++ b/docs/examples/cli/llm_schema.json
@@ -0,0 +1,26 @@
+{
+ "title": "NewsArticle",
+ "type": "object",
+ "properties": {
+ "title": {
+ "type": "string",
+ "description": "The title/headline of the news article"
+ },
+ "link": {
+ "type": "string",
+ "description": "The URL or link to the full article"
+ },
+ "details": {
+ "type": "string",
+ "description": "Brief summary or details about the article content"
+ },
+ "topics": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "List of topics or categories associated with the article"
+ }
+ },
+ "required": ["title", "details"]
+}
\ No newline at end of file
diff --git a/docs/examples/crawler_monitor_example.py b/docs/examples/crawler_monitor_example.py
new file mode 100644
index 00000000..85d80ae6
--- /dev/null
+++ b/docs/examples/crawler_monitor_example.py
@@ -0,0 +1,209 @@
+"""
+CrawlerMonitor Example
+
+This example demonstrates how to use the CrawlerMonitor component
+to visualize and track web crawler operations in real-time.
+"""
+
+import time
+import uuid
+import random
+import threading
+from crawl4ai.components.crawler_monitor import CrawlerMonitor
+from crawl4ai.models import CrawlStatus
+
+def simulate_webcrawler_operations(monitor, num_tasks=20):
+ """
+ Simulates a web crawler's operations with multiple tasks and different states.
+
+ Args:
+ monitor: The CrawlerMonitor instance
+ num_tasks: Number of tasks to simulate
+ """
+ print(f"Starting simulation with {num_tasks} tasks...")
+
+ # Create and register all tasks first
+ task_ids = []
+ for i in range(num_tasks):
+ task_id = str(uuid.uuid4())
+ url = f"https://example.com/page{i}"
+ monitor.add_task(task_id, url)
+ task_ids.append((task_id, url))
+
+ # Small delay between task creation
+ time.sleep(0.2)
+
+ # Process tasks with a variety of different behaviors
+ threads = []
+ for i, (task_id, url) in enumerate(task_ids):
+ # Create a thread for each task
+ thread = threading.Thread(
+ target=process_task,
+ args=(monitor, task_id, url, i)
+ )
+ thread.daemon = True
+ threads.append(thread)
+
+ # Start threads in batches to simulate concurrent processing
+ batch_size = 4 # Process 4 tasks at a time
+ for i in range(0, len(threads), batch_size):
+ batch = threads[i:i+batch_size]
+ for thread in batch:
+ thread.start()
+ time.sleep(0.5) # Stagger thread start times
+
+ # Wait a bit before starting next batch
+ time.sleep(random.uniform(1.0, 3.0))
+
+ # Update queue statistics
+ update_queue_stats(monitor)
+
+ # Simulate memory pressure changes
+ active_threads = [t for t in threads if t.is_alive()]
+ if len(active_threads) > 8:
+ monitor.update_memory_status("CRITICAL")
+ elif len(active_threads) > 4:
+ monitor.update_memory_status("PRESSURE")
+ else:
+ monitor.update_memory_status("NORMAL")
+
+ # Wait for all threads to complete
+ for thread in threads:
+ thread.join()
+
+ # Final updates
+ update_queue_stats(monitor)
+ monitor.update_memory_status("NORMAL")
+
+ print("Simulation completed!")
+
+def process_task(monitor, task_id, url, index):
+ """Simulate processing of a single task."""
+ # Tasks start in queued state (already added)
+
+ # Simulate waiting in queue
+ wait_time = random.uniform(0.5, 3.0)
+ time.sleep(wait_time)
+
+ # Start processing - move to IN_PROGRESS
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.IN_PROGRESS,
+ start_time=time.time(),
+ wait_time=wait_time
+ )
+
+ # Simulate task processing with memory usage changes
+ total_process_time = random.uniform(2.0, 10.0)
+ step_time = total_process_time / 5 # Update in 5 steps
+
+ for step in range(5):
+ # Simulate increasing then decreasing memory usage
+ if step < 3: # First 3 steps - increasing
+ memory_usage = random.uniform(5.0, 20.0) * (step + 1)
+ else: # Last 2 steps - decreasing
+ memory_usage = random.uniform(5.0, 20.0) * (5 - step)
+
+ # Update peak memory if this is higher
+ peak = max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
+
+ monitor.update_task(
+ task_id=task_id,
+ memory_usage=memory_usage,
+ peak_memory=peak
+ )
+
+ time.sleep(step_time)
+
+ # Determine final state - 80% success, 20% failure
+ if index % 5 == 0: # Every 5th task fails
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.FAILED,
+ end_time=time.time(),
+ memory_usage=0.0,
+ error_message="Connection timeout"
+ )
+ else:
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.COMPLETED,
+ end_time=time.time(),
+ memory_usage=0.0
+ )
+
+def update_queue_stats(monitor):
+ """Update queue statistics based on current tasks."""
+ task_stats = monitor.get_all_task_stats()
+
+ # Count queued tasks
+ queued_tasks = [
+ stats for stats in task_stats.values()
+ if stats["status"] == CrawlStatus.QUEUED.name
+ ]
+
+ total_queued = len(queued_tasks)
+
+ if total_queued > 0:
+ current_time = time.time()
+ # Calculate wait times
+ wait_times = [
+ current_time - stats.get("enqueue_time", current_time)
+ for stats in queued_tasks
+ ]
+ highest_wait_time = max(wait_times) if wait_times else 0.0
+ avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
+ else:
+ highest_wait_time = 0.0
+ avg_wait_time = 0.0
+
+ # Update monitor
+ monitor.update_queue_statistics(
+ total_queued=total_queued,
+ highest_wait_time=highest_wait_time,
+ avg_wait_time=avg_wait_time
+ )
+
+def main():
+ # Initialize the monitor
+ monitor = CrawlerMonitor(
+ urls_total=20, # Total URLs to process
+ refresh_rate=0.5, # Update UI twice per second
+ enable_ui=True, # Enable terminal UI
+ max_width=120 # Set maximum width to 120 characters
+ )
+
+ # Start the monitor
+ monitor.start()
+
+ try:
+ # Run simulation
+ simulate_webcrawler_operations(monitor)
+
+ # Keep monitor running a bit to see final state
+ print("Waiting to view final state...")
+ time.sleep(5)
+
+ except KeyboardInterrupt:
+ print("\nExample interrupted by user")
+ finally:
+ # Stop the monitor
+ monitor.stop()
+ print("Example completed!")
+
+ # Print some statistics
+ summary = monitor.get_summary()
+ print("\nCrawler Statistics Summary:")
+ print(f"Total URLs: {summary['urls_total']}")
+ print(f"Completed: {summary['urls_completed']}")
+ print(f"Completion percentage: {summary['completion_percentage']:.1f}%")
+ print(f"Peak memory usage: {summary['peak_memory_percent']:.1f}%")
+
+ # Print task status counts
+ status_counts = summary['status_counts']
+ print("\nTask Status Counts:")
+ for status, count in status_counts.items():
+ print(f" {status}: {count}")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py
new file mode 100644
index 00000000..c5537a93
--- /dev/null
+++ b/docs/examples/crypto_analysis_example.py
@@ -0,0 +1,445 @@
+"""
+Crawl4AI Crypto Trading Analysis Demo
+Author: Unclecode
+Date: 2024-03-15
+
+This script demonstrates advanced crypto market analysis using:
+1. Web scraping of real-time CoinMarketCap data
+2. Smart table extraction with layout detection
+3. Hedge fund-grade financial metrics
+4. Interactive visualizations for trading signals
+
+Key Features:
+- Volume Anomaly Detection: Finds unusual trading activity
+- Liquidity Power Score: Identifies easily tradable assets
+- Volatility-Weighted Momentum: Surfaces sustainable trends
+- Smart Money Signals: Algorithmic buy/hold recommendations
+"""
+
+import asyncio
+import pandas as pd
+import numpy as np
+import re
+import plotly.express as px
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ LXMLWebScrapingStrategy,
+)
+from crawl4ai import CrawlResult
+from typing import List
+
+__current_dir__ = __file__.rsplit("/", 1)[0]
+
+class CryptoAlphaGenerator:
+ """
+ Advanced crypto analysis engine that transforms raw web data into:
+ - Volume anomaly flags
+ - Liquidity scores
+ - Momentum-risk ratios
+ - Machine learning-inspired trading signals
+
+ Methods:
+ analyze_tables(): Process raw tables into trading insights
+ create_visuals(): Generate institutional-grade visualizations
+ generate_insights(): Create plain English trading recommendations
+ """
+
+ def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Convert crypto market data to machine-readable format.
+        Handles currency symbols, unit suffixes (B = billions, T = trillions), and percentage values.
+ """
+ # Make a copy to avoid SettingWithCopyWarning
+ df = df.copy()
+
+ # Clean Price column (handle currency symbols)
+        df["Price"] = df["Price"].astype(str).str.replace(r"[^\d.]", "", regex=True).astype(float)
+
+ # Handle Market Cap and Volume, considering both Billions and Trillions
+ def convert_large_numbers(value):
+ if pd.isna(value):
+ return float('nan')
+ value = str(value)
+ multiplier = 1
+ if 'B' in value:
+ multiplier = 1e9
+ elif 'T' in value:
+ multiplier = 1e12
+ # Handle cases where the value might already be numeric
+ cleaned_value = re.sub(r"[^\d.]", "", value)
+ return float(cleaned_value) * multiplier if cleaned_value else float('nan')
+
+ df["Market Cap"] = df["Market Cap"].apply(convert_large_numbers)
+ df["Volume(24h)"] = df["Volume(24h)"].apply(convert_large_numbers)
+
+ # Convert percentages to decimal values
+ for col in ["1h %", "24h %", "7d %"]:
+ if col in df.columns:
+ # First ensure it's string, then clean
+ df[col] = (
+ df[col].astype(str)
+ .str.replace("%", "")
+ .str.replace(",", ".")
+ .replace("nan", np.nan)
+ )
+ df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+
+ return df
+
+ def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Compute advanced trading metrics used by quantitative funds:
+
+ 1. Volume/Market Cap Ratio - Measures liquidity efficiency
+           (a high ratio means trading attention outpaces market cap; on small caps this points to higher growth potential)
+
+        2. Volatility Score - Risk-adjusted momentum potential; shows how stable the trend is
+           (standard deviation of the 1h/24h/7d returns)
+
+        3. Momentum Score - Weighted average of returns; shows how strong the trend is
+           (1h:30% + 24h:50% + 7d:20%)
+
+        4. Volume Anomaly - Flags coins whose 24h volume exceeds 3x the median volume
+           (unusual trading activity that may indicate insider buying or breaking news)
+ """
+ # Liquidity Metrics
+ df["Volume/Market Cap Ratio"] = df["Volume(24h)"] / df["Market Cap"]
+
+ # Risk Metrics
+ df["Volatility Score"] = df[["1h %", "24h %", "7d %"]].std(axis=1)
+
+ # Momentum Metrics
+ df["Momentum Score"] = df["1h %"] * 0.3 + df["24h %"] * 0.5 + df["7d %"] * 0.2
+
+ # Anomaly Detection
+ median_vol = df["Volume(24h)"].median()
+ df["Volume Anomaly"] = df["Volume(24h)"] > 3 * median_vol
+
+ # Value Flags
+ # Undervalued Flag - Low market cap and high momentum
+ # (High growth potential and low attention)
+ df["Undervalued Flag"] = (df["Market Cap"] < 1e9) & (
+ df["Momentum Score"] > 0.05
+ )
+ # Liquid Giant Flag - High volume/market cap ratio and large market cap
+ # (High liquidity and large market cap = institutional interest)
+ df["Liquid Giant"] = (df["Volume/Market Cap Ratio"] > 0.15) & (
+ df["Market Cap"] > 1e9
+ )
+
+ return df
+
+ def generate_insights_simple(self, df: pd.DataFrame) -> str:
+ """
+ Generates an ultra-actionable crypto trading report with:
+ - Risk-tiered opportunities (High/Medium/Low)
+ - Concrete examples for each trade type
+ - Entry/exit strategies spelled out
+ - Visual cues for quick scanning
+ """
+ report = [
+ "🚀 **CRYPTO TRADING CHEAT SHEET** 🚀",
+ "*Based on quantitative signals + hedge fund tactics*",
+ "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ ]
+
+ # 1. HIGH-RISK: Undervalued Small-Caps (Momentum Plays)
+ high_risk = df[df["Undervalued Flag"]].sort_values("Momentum Score", ascending=False)
+ if not high_risk.empty:
+ example_coin = high_risk.iloc[0]
+ report.extend([
+ "\n🔥 **HIGH-RISK: Rocket Fuel Small-Caps**",
+ f"*Example Trade:* {example_coin['Name']} (Price: ${example_coin['Price']:.6f})",
+ "📊 *Why?* Tiny market cap (<$1B) but STRONG momentum (+{:.0f}% last week)".format(example_coin['7d %']*100),
+ "🎯 *Strategy:*",
+ "1. Wait for 5-10% dip from recent high (${:.6f} → Buy under ${:.6f})".format(
+ example_coin['Price'] / (1 - example_coin['24h %']), # Approx recent high
+ example_coin['Price'] * 0.95
+ ),
+ "2. Set stop-loss at -10% (${:.6f})".format(example_coin['Price'] * 0.90),
+ "3. Take profit at +20% (${:.6f})".format(example_coin['Price'] * 1.20),
+ "⚠️ *Risk Warning:* These can drop 30% fast! Never bet more than 5% of your portfolio.",
+ "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ ])
+
+ # 2. MEDIUM-RISK: Liquid Giants (Swing Trades)
+ medium_risk = df[df["Liquid Giant"]].sort_values("Volume/Market Cap Ratio", ascending=False)
+ if not medium_risk.empty:
+ example_coin = medium_risk.iloc[0]
+ report.extend([
+ "\n💎 **MEDIUM-RISK: Liquid Giants (Safe Swing Trades)**",
+ f"*Example Trade:* {example_coin['Name']} (Market Cap: ${example_coin['Market Cap']/1e9:.1f}B)",
+ "📊 *Why?* Huge volume (${:.1f}M/day) makes it easy to enter/exit".format(example_coin['Volume(24h)']/1e6),
+ "🎯 *Strategy:*",
+ "1. Buy when 24h volume > 15% of market cap (Current: {:.0f}%)".format(example_coin['Volume/Market Cap Ratio']*100),
+ "2. Hold 1-4 weeks (Big coins trend longer)",
+ "3. Exit when momentum drops below 5% (Current: {:.0f}%)".format(example_coin['Momentum Score']*100),
+ "📉 *Pro Tip:* Watch Bitcoin's trend - if BTC drops 5%, these usually follow.",
+ "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ ])
+
+ # 3. LOW-RISK: Stable Momentum (DCA Targets)
+ low_risk = df[
+ (df["Momentum Score"] > 0.05) &
+ (df["Volatility Score"] < 0.03)
+ ].sort_values("Market Cap", ascending=False)
+ if not low_risk.empty:
+ example_coin = low_risk.iloc[0]
+ report.extend([
+ "\n🛡️ **LOW-RISK: Steady Climbers (DCA & Forget)**",
+ f"*Example Trade:* {example_coin['Name']} (Volatility: {example_coin['Volatility Score']:.2f}/5)",
+ "📊 *Why?* Rises steadily (+{:.0f}%/week) with LOW drama".format(example_coin['7d %']*100),
+ "🎯 *Strategy:*",
+ "1. Buy small amounts every Tuesday/Friday (DCA)",
+ "2. Hold for 3+ months (Compound gains work best here)",
+ "3. Sell 10% at every +25% milestone",
+ "💰 *Best For:* Long-term investors who hate sleepless nights",
+ "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+ ])
+
+ # Volume Spike Alerts
+ anomalies = df[df["Volume Anomaly"]].sort_values("Volume(24h)", ascending=False)
+ if not anomalies.empty:
+ example_coin = anomalies.iloc[0]
+ report.extend([
+ "\n🚨 **Volume Spike Alert (Possible News/Whale Action)**",
+ f"*Coin:* {example_coin['Name']} (Volume: ${example_coin['Volume(24h)']/1e6:.1f}M, usual: ${example_coin['Volume(24h)']/3/1e6:.1f}M)",
+ "🔍 *Check:* Twitter/CoinGecko for news before trading",
+ "⚡ *If no news:* Could be insider buying - watch price action:",
+ "- Break above today's high → Buy with tight stop-loss",
+ "- Fade back down → Avoid (may be a fakeout)"
+ ])
+
+ # Pro Tip Footer
+ report.append("\n✨ *Pro Tip:* Bookmark this report & check back in 24h to see if signals held up.")
+
+ return "\n".join(report)
+
+ def generate_insights(self, df: pd.DataFrame) -> str:
+ """
+ Generates a tactical trading report with:
+ - Top 3 trades per risk level (High/Medium/Low)
+ - Auto-calculated entry/exit prices
+ - BTC chart toggle tip
+ """
+ # Filter top candidates for each risk level
+ high_risk = (
+ df[df["Undervalued Flag"]]
+ .sort_values("Momentum Score", ascending=False)
+ .head(3)
+ )
+ medium_risk = (
+ df[df["Liquid Giant"]]
+ .sort_values("Volume/Market Cap Ratio", ascending=False)
+ .head(3)
+ )
+ low_risk = (
+ df[(df["Momentum Score"] > 0.05) & (df["Volatility Score"] < 0.03)]
+ .sort_values("Momentum Score", ascending=False)
+ .head(3)
+ )
+
+ report = ["# 🎯 Crypto Trading Tactical Report (Top 3 Per Risk Tier)"]
+
+ # 1. High-Risk Trades (Small-Cap Momentum)
+ if not high_risk.empty:
+ report.append("\n## 🔥 HIGH RISK: Small-Cap Rockets (5-50% Potential)")
+ for i, coin in high_risk.iterrows():
+ current_price = coin["Price"]
+ entry = current_price * 0.95 # -5% dip
+ stop_loss = current_price * 0.90 # -10%
+ take_profit = current_price * 1.20 # +20%
+
+ report.append(
+ f"\n### {coin['Name']} (Momentum: {coin['Momentum Score']:.1%})"
+ f"\n- **Current Price:** ${current_price:.4f}"
+ f"\n- **Entry:** < ${entry:.4f} (Wait for pullback)"
+ f"\n- **Stop-Loss:** ${stop_loss:.4f} (-10%)"
+ f"\n- **Target:** ${take_profit:.4f} (+20%)"
+ f"\n- **Risk/Reward:** 1:2"
+ f"\n- **Watch:** Volume spikes above {coin['Volume(24h)']/1e6:.1f}M"
+ )
+
+ # 2. Medium-Risk Trades (Liquid Giants)
+ if not medium_risk.empty:
+ report.append("\n## 💎 MEDIUM RISK: Liquid Swing Trades (10-30% Potential)")
+ for i, coin in medium_risk.iterrows():
+ current_price = coin["Price"]
+ entry = current_price * 0.98 # -2% dip
+ stop_loss = current_price * 0.94 # -6%
+ take_profit = current_price * 1.15 # +15%
+
+ report.append(
+ f"\n### {coin['Name']} (Liquidity Score: {coin['Volume/Market Cap Ratio']:.1%})"
+ f"\n- **Current Price:** ${current_price:.2f}"
+ f"\n- **Entry:** < ${entry:.2f} (Buy slight dips)"
+ f"\n- **Stop-Loss:** ${stop_loss:.2f} (-6%)"
+ f"\n- **Target:** ${take_profit:.2f} (+15%)"
+ f"\n- **Hold Time:** 1-3 weeks"
+ f"\n- **Key Metric:** Volume/Cap > 15%"
+ )
+
+ # 3. Low-Risk Trades (Stable Momentum)
+ if not low_risk.empty:
+ report.append("\n## 🛡️ LOW RISK: Steady Gainers (5-15% Potential)")
+ for i, coin in low_risk.iterrows():
+ current_price = coin["Price"]
+ entry = current_price * 0.99 # -1% dip
+ stop_loss = current_price * 0.97 # -3%
+ take_profit = current_price * 1.10 # +10%
+
+ report.append(
+ f"\n### {coin['Name']} (Stability Score: {1/coin['Volatility Score']:.1f}x)"
+ f"\n- **Current Price:** ${current_price:.2f}"
+ f"\n- **Entry:** < ${entry:.2f} (Safe zone)"
+ f"\n- **Stop-Loss:** ${stop_loss:.2f} (-3%)"
+ f"\n- **Target:** ${take_profit:.2f} (+10%)"
+ f"\n- **DCA Suggestion:** 3 buys over 72 hours"
+ )
+
+ # Volume Anomaly Alert
+ anomalies = df[df["Volume Anomaly"]].sort_values("Volume(24h)", ascending=False).head(2)
+ if not anomalies.empty:
+ report.append("\n⚠️ **Volume Spike Alerts**")
+ for i, coin in anomalies.iterrows():
+ report.append(
+ f"- {coin['Name']}: Volume {coin['Volume(24h)']/1e6:.1f}M "
+ f"(3x normal) | Price moved: {coin['24h %']:.1%}"
+ )
+
+ # Pro Tip
+ report.append(
+ "\n📊 **Chart Hack:** Hide BTC in visuals:\n"
+ "```python\n"
+ "# For 3D Map:\n"
+ "fig.update_traces(visible=False, selector={'name':'Bitcoin'})\n"
+ "# For Treemap:\n"
+ "df = df[df['Name'] != 'Bitcoin']\n"
+ "```"
+ )
+
+ return "\n".join(report)
+
+ def create_visuals(self, df: pd.DataFrame) -> dict:
+ """Enhanced visuals with BTC toggle support"""
+ # 3D Market Map (with BTC toggle hint)
+ fig1 = px.scatter_3d(
+ df,
+ x="Market Cap",
+ y="Volume/Market Cap Ratio",
+ z="Momentum Score",
+ color="Name", # Color by name to allow toggling
+ hover_name="Name",
+ title="Market Map (Toggle BTC in legend to focus on alts)",
+ log_x=True
+ )
+ fig1.update_traces(
+ marker=dict(size=df["Volatility Score"]*100 + 5) # Dynamic sizing
+ )
+
+ # Liquidity Tree (exclude BTC if too dominant)
+        # Guard: the BTC row may be absent after upstream filtering
+        btc_rows = df[df["Name"] == "BitcoinBTC"]
+        if not btc_rows.empty and btc_rows["Market Cap"].values[0] > df["Market Cap"].median() * 10:
+            df = df[df["Name"] != "BitcoinBTC"]
+
+ fig2 = px.treemap(
+ df,
+ path=["Name"],
+ values="Market Cap",
+ color="Volume/Market Cap Ratio",
+ title="Liquidity Tree (BTC auto-removed if dominant)"
+ )
+
+ return {"market_map": fig1, "liquidity_tree": fig2}
+
+async def main():
+ """
+ Main execution flow:
+ 1. Configure headless browser for scraping
+ 2. Extract live crypto market data
+ 3. Clean and analyze using hedge fund models
+ 4. Generate visualizations and insights
+ 5. Output professional trading report
+ """
+ # Configure browser with anti-detection features
+ browser_config = BrowserConfig(
+ headless=False,
+ )
+
+ # Initialize crawler with smart table detection
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ try:
+ # Set up scraping parameters
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ table_score_threshold=8, # Strict table detection
+ keep_data_attributes=True,
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ scan_full_page=True,
+ scroll_delay=0.2,
+ )
+
+ # Execute market data extraction
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://coinmarketcap.com/?page=1", config=crawl_config
+ )
+
+ # Process results
+ raw_df = pd.DataFrame()
+ for result in results:
+ # Use the new tables field, falling back to media["tables"] for backward compatibility
+ tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", [])
+ if result.success and tables:
+ # Extract primary market table
+ # DataFrame
+ raw_df = pd.DataFrame(
+ tables[0]["rows"],
+ columns=tables[0]["headers"],
+ )
+ break
+
+
+ # This is for debugging only
+ # ////// Remove this in production from here..
+ # Save raw data for debugging
+ raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
+ print("🔍 Raw data saved to 'raw_crypto_data.csv'")
+
+ # Read from file for debugging
+ raw_df = pd.read_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv")
+ # ////// ..to here
+
+        # Select the top 50 rows
+ raw_df = raw_df.head(50)
+ # Remove "Buy" from name
+ raw_df["Name"] = raw_df["Name"].str.replace("Buy", "")
+
+ # Initialize analysis engine
+ analyzer = CryptoAlphaGenerator()
+ clean_df = analyzer.clean_data(raw_df)
+ analyzed_df = analyzer.calculate_metrics(clean_df)
+
+ # Generate outputs
+ visuals = analyzer.create_visuals(analyzed_df)
+ insights = analyzer.generate_insights(analyzed_df)
+
+ # Save visualizations
+ visuals["market_map"].write_html(f"{__current_dir__}/tmp/market_map.html")
+ visuals["liquidity_tree"].write_html(f"{__current_dir__}/tmp/liquidity_tree.html")
+
+ # Display results
+ print("🔑 Key Trading Insights:")
+ print(insights)
+ print("\n📊 Open 'market_map.html' for interactive analysis")
+ print("\n📊 Open 'liquidity_tree.html' for interactive analysis")
+
+ finally:
+ await crawler.close()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/deepcrawl_example.py b/docs/examples/deepcrawl_example.py
new file mode 100644
index 00000000..741c0039
--- /dev/null
+++ b/docs/examples/deepcrawl_example.py
@@ -0,0 +1,498 @@
+import asyncio
+import time
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import (
+ FilterChain,
+ URLPatternFilter,
+ DomainFilter,
+ ContentTypeFilter,
+ ContentRelevanceFilter,
+ SEOFilter,
+)
+from crawl4ai.deep_crawling.scorers import (
+ KeywordRelevanceScorer,
+)
+
+
+# 1️⃣ Basic Deep Crawl Setup
+async def basic_deep_crawl():
+ """
+ PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl.
+
+ This function shows:
+ - How to set up BFSDeepCrawlStrategy (Breadth-First Search)
+ - Setting depth and domain parameters
+ - Processing the results to show the hierarchy
+ """
+ print("\n===== BASIC DEEP CRAWL SETUP =====")
+
+ # Configure a 2-level deep crawl using Breadth-First Search strategy
+ # max_depth=2 means: initial page (depth 0) + 2 more levels
+ # include_external=False means: only follow links within the same domain
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True, # Show progress during crawling
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ start_time = time.perf_counter()
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+ # Group results by depth to visualize the crawl tree
+ pages_by_depth = {}
+ for result in results:
+ depth = result.metadata.get("depth", 0)
+ if depth not in pages_by_depth:
+ pages_by_depth[depth] = []
+ pages_by_depth[depth].append(result.url)
+
+ print(f"✅ Crawled {len(results)} pages total")
+
+ # Display crawl structure by depth
+ for depth, urls in sorted(pages_by_depth.items()):
+ print(f"\nDepth {depth}: {len(urls)} pages")
+ # Show first 3 URLs for each depth as examples
+ for url in urls[:3]:
+ print(f" → {url}")
+ if len(urls) > 3:
+ print(f" ... and {len(urls) - 3} more")
+
+ print(
+ f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
+ )
+
+# 2️⃣ Stream vs. Non-Stream Execution
+async def stream_vs_nonstream():
+ """
+ PART 2: Demonstrates the difference between stream and non-stream execution.
+
+ Non-stream: Waits for all results before processing
+ Stream: Processes results as they become available
+ """
+ print("\n===== STREAM VS. NON-STREAM EXECUTION =====")
+
+ # Common configuration for both examples
+ base_config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=False,
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ # NON-STREAMING MODE
+ print("\n📊 NON-STREAMING MODE:")
+ print(" In this mode, all results are collected before being returned.")
+
+ non_stream_config = base_config.clone()
+ non_stream_config.stream = False
+
+ start_time = time.perf_counter()
+ results = await crawler.arun(
+ url="https://docs.crawl4ai.com", config=non_stream_config
+ )
+
+ print(f" ✅ Received all {len(results)} results at once")
+ print(f" ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")
+
+ # STREAMING MODE
+ print("\n📊 STREAMING MODE:")
+ print(" In this mode, results are processed as they become available.")
+
+ stream_config = base_config.clone()
+ stream_config.stream = True
+
+ start_time = time.perf_counter()
+ result_count = 0
+ first_result_time = None
+
+ async for result in await crawler.arun(
+ url="https://docs.crawl4ai.com", config=stream_config
+ ):
+ result_count += 1
+ if result_count == 1:
+ first_result_time = time.perf_counter() - start_time
+ print(
+ f" ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
+ )
+ elif result_count % 5 == 0: # Show every 5th result for brevity
+ print(f" → Result #{result_count}: {result.url}")
+
+ print(f" ✅ Total: {result_count} results")
+ print(f" ✅ First result: {first_result_time:.2f} seconds")
+ print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
+ print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
+
+# 3️⃣ Introduce Filters & Scorers
+async def filters_and_scorers():
+ """
+ PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
+
+ This function progressively adds:
+ 1. A single URL pattern filter
+ 2. Multiple filters in a chain
+ 3. Scorers for prioritizing pages
+ """
+ print("\n===== FILTERS AND SCORERS =====")
+
+ async with AsyncWebCrawler() as crawler:
+ # SINGLE FILTER EXAMPLE
+ print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
+ print(" Only crawl pages containing 'core' in the URL")
+
+ # Create a filter that only allows URLs with 'guide' in them
+        # Create a filter that only allows URLs with 'core' in them
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1,
+ include_external=False,
+ filter_chain=FilterChain([url_filter]), # Single filter
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ cache_mode=CacheMode.BYPASS,
+ verbose=True,
+ )
+
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+ print(f" ✅ Crawled {len(results)} pages matching '*core*'")
+ for result in results[:3]: # Show first 3 results
+ print(f" → {result.url}")
+ if len(results) > 3:
+ print(f" ... and {len(results) - 3} more")
+
+ # MULTIPLE FILTERS EXAMPLE
+ print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
+ print(" Only crawl pages that:")
+ print(" 1. Contain '2024' in the URL")
+ print(" 2. Are from 'techcrunch.com'")
+ print(" 3. Are of text/html or application/javascript content type")
+
+ # Create a chain of filters
+ filter_chain = FilterChain(
+ [
+ URLPatternFilter(patterns=["*2024*"]),
+ DomainFilter(
+ allowed_domains=["techcrunch.com"],
+ blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
+ ),
+ ContentTypeFilter(
+ allowed_types=["text/html", "application/javascript"]
+ ),
+ ]
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1, include_external=False, filter_chain=filter_chain
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ )
+
+ results = await crawler.arun(url="https://techcrunch.com", config=config)
+
+ print(f" ✅ Crawled {len(results)} pages after applying all filters")
+ for result in results[:3]:
+ print(f" → {result.url}")
+ if len(results) > 3:
+ print(f" ... and {len(results) - 3} more")
+
+ # SCORERS EXAMPLE
+ print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
+ print(
+ "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
+ )
+
+ # Create a keyword relevance scorer
+ keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=1, include_external=False, url_scorer=keyword_scorer
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ cache_mode=CacheMode.BYPASS,
+ verbose=True,
+ stream=True,
+ )
+
+ results = []
+ async for result in await crawler.arun(
+ url="https://docs.crawl4ai.com", config=config
+ ):
+ results.append(result)
+ score = result.metadata.get("score")
+ print(f" → Score: {score:.2f} | {result.url}")
+
+ print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
+ print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
+
+# 4️⃣ Advanced Filters
+async def advanced_filters():
+ """
+ PART 4: Demonstrates advanced filtering techniques for specialized crawling.
+
+ This function covers:
+ - SEO filters
+ - Text relevancy filtering
+ - Combining advanced filters
+ """
+ print("\n===== ADVANCED FILTERS =====")
+
+ async with AsyncWebCrawler() as crawler:
+ # SEO FILTER EXAMPLE
+ print("\n📊 EXAMPLE 1: SEO FILTERS")
+ print(
+            "Quantitative SEO quality assessment filter based on searching for keywords in the head section"
+ )
+
+ seo_filter = SEOFilter(
+ threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1, filter_chain=FilterChain([seo_filter])
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ )
+
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+ print(f" ✅ Found {len(results)} pages with relevant keywords")
+ for result in results:
+ print(f" → {result.url}")
+
+ # ADVANCED TEXT RELEVANCY FILTER
+ print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
+
+ # More sophisticated content relevance filter
+ relevance_filter = ContentRelevanceFilter(
+ query="Interact with the web using your authentic digital identity",
+ threshold=0.7,
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1, filter_chain=FilterChain([relevance_filter])
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ )
+
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
+
+ print(f" ✅ Found {len(results)} pages")
+ for result in results:
+ relevance_score = result.metadata.get("relevance_score", 0)
+ print(f" → Score: {relevance_score:.2f} | {result.url}")
+
+# 5️⃣ Max Pages and Score Thresholds
+async def max_pages_and_thresholds():
+ """
+ PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
+
+ This function shows:
+ - How to limit the number of pages crawled
+ - How to set score thresholds for more targeted crawling
+ - Comparing BFS, DFS, and Best-First strategies with these parameters
+ """
+ print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
+
+ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+
+ async with AsyncWebCrawler() as crawler:
+ # Define a common keyword scorer for all examples
+ keyword_scorer = KeywordRelevanceScorer(
+ keywords=["browser", "crawler", "web", "automation"],
+ weight=1.0
+ )
+
+ # EXAMPLE 1: BFS WITH MAX PAGES
+ print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
+ print(" Limit the crawler to a maximum of 5 pages")
+
+ bfs_config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2,
+ include_external=False,
+ url_scorer=keyword_scorer,
+ max_pages=5 # Only crawl 5 pages
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ )
+
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
+
+ print(f" ✅ Crawled exactly {len(results)} pages as specified by max_pages")
+ for result in results:
+ depth = result.metadata.get("depth", 0)
+ print(f" → Depth: {depth} | {result.url}")
+
+ # EXAMPLE 2: DFS WITH SCORE THRESHOLD
+ print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
+        print("   Only crawl pages with a relevance score above 0.7")
+
+ dfs_config = CrawlerRunConfig(
+ deep_crawl_strategy=DFSDeepCrawlStrategy(
+ max_depth=2,
+ include_external=False,
+ url_scorer=keyword_scorer,
+                score_threshold=0.7,  # Only process URLs with scores above 0.7
+ max_pages=10
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ )
+
+ results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
+
+ print(f" ✅ Crawled {len(results)} pages with scores above threshold")
+ for result in results:
+ score = result.metadata.get("score", 0)
+ depth = result.metadata.get("depth", 0)
+ print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+ # EXAMPLE 3: BEST-FIRST WITH BOTH CONSTRAINTS
+        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH A PAGE LIMIT")
+        print("   Limit to 7 pages, prioritizing the highest-scoring pages first")
+
+ bf_config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ url_scorer=keyword_scorer,
+ max_pages=7, # Limit to 7 pages total
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ stream=True,
+ )
+
+ results = []
+ async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
+ results.append(result)
+ score = result.metadata.get("score", 0)
+ depth = result.metadata.get("depth", 0)
+ print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+        print(f"   ✅ Crawled {len(results)} high-value pages (capped at max_pages=7)")
+ if results:
+ avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
+ print(f" ✅ Average score: {avg_score:.2f}")
+ print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
+
+# 6️⃣ Wrap-Up and Key Takeaways
+async def wrap_up():
+ """
+ PART 6: Wrap-Up and Key Takeaways
+
+ Summarize the key concepts learned in this tutorial.
+ """
+ print("\n===== COMPLETE CRAWLER EXAMPLE =====")
+ print("Combining filters, scorers, and streaming for an optimized crawl")
+
+ # Create a sophisticated filter chain
+ filter_chain = FilterChain(
+ [
+ DomainFilter(
+ allowed_domains=["docs.crawl4ai.com"],
+ blocked_domains=["old.docs.crawl4ai.com"],
+ ),
+ URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
+ ContentTypeFilter(allowed_types=["text/html"]),
+ ]
+ )
+
+ # Create a composite scorer that combines multiple scoring strategies
+ keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"], weight=0.7
+ )
+ # Set up the configuration
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=1,
+ include_external=False,
+ filter_chain=filter_chain,
+ url_scorer=keyword_scorer,
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ stream=True,
+ verbose=True,
+ )
+
+ # Execute the crawl
+ results = []
+ start_time = time.perf_counter()
+
+ async with AsyncWebCrawler() as crawler:
+ async for result in await crawler.arun(
+ url="https://docs.crawl4ai.com", config=config
+ ):
+ results.append(result)
+ score = result.metadata.get("score", 0)
+ depth = result.metadata.get("depth", 0)
+ print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+ duration = time.perf_counter() - start_time
+
+ # Summarize the results
+ print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+ print(
+ f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
+ )
+
+ # Group by depth
+ depth_counts = {}
+ for result in results:
+ depth = result.metadata.get("depth", 0)
+ depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+ print("\n📊 Pages crawled by depth:")
+ for depth, count in sorted(depth_counts.items()):
+ print(f" Depth {depth}: {count} pages")
+
+
+async def run_tutorial():
+ """
+ Executes all tutorial sections in sequence.
+ """
+ print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
+ print("======================================")
+ print("This tutorial will walk you through deep crawling techniques,")
+ print("from basic to advanced, using the Crawl4AI library.")
+
+ # Define sections - uncomment to run specific parts during development
+ tutorial_sections = [
+ basic_deep_crawl,
+ stream_vs_nonstream,
+ filters_and_scorers,
+ max_pages_and_thresholds,
+ advanced_filters,
+ wrap_up,
+ ]
+
+ for section in tutorial_sections:
+ await section()
+
+ print("\n🎉 TUTORIAL COMPLETE! 🎉")
+ print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
+ print("For more information, check out https://docs.crawl4ai.com")
+
+# Execute the tutorial when run directly
+if __name__ == "__main__":
+ asyncio.run(run_tutorial())
\ No newline at end of file
diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py
index cac08186..8ac24d3b 100644
--- a/docs/examples/dispatcher_example.py
+++ b/docs/examples/dispatcher_example.py
@@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
start = time.perf_counter()
async with AsyncWebCrawler(config=browser_config) as crawler:
dispatcher = MemoryAdaptiveDispatcher(
- memory_threshold_percent=70.0,
+ memory_threshold_percent=95.0,
max_session_permit=10,
rate_limiter=RateLimiter(
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py
new file mode 100644
index 00000000..0a3d51af
--- /dev/null
+++ b/docs/examples/docker/demo_docker_api.py
@@ -0,0 +1,1317 @@
+import asyncio
+import httpx
+import json
+import os
+import time
+from typing import List, Dict, Any, AsyncGenerator, Optional
+import textwrap # ← new: for pretty code literals
+import urllib.parse # ← needed for URL-safe /llm calls
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.syntax import Syntax
+from rich.panel import Panel
+from rich.table import Table
+
+# --- Setup & Configuration ---
+load_dotenv() # Load environment variables from .env file
+
+console = Console()
+
+# --- Configuration ---
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
+# Target URLs
+SIMPLE_URL = "https://example.com" # For demo purposes
+SIMPLE_URL = "https://httpbin.org/html"
+LINKS_URL = "https://httpbin.org/links/10/0"
+FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
+BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
+PYTHON_URL = "https://python.org" # For deeper crawl
+# Use the same sample site as deep crawl tests for consistency
+DEEP_CRAWL_BASE_URL = os.getenv(
+ "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
+
+# --- Helper Functions ---
+
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ console.print("[bold cyan]Checking server health...[/]", end="")
+ try:
+ response = await client.get("/health", timeout=10.0)
+ response.raise_for_status()
+ health_data = response.json()
+ console.print(
+ f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ console.print(f"\n[bold red]Server health check FAILED:[/]")
+ console.print(f"Error: {e}")
+ console.print(f"Is the server running at {BASE_URL}?")
+ return False
+ except Exception as e:
+ console.print(
+ f"\n[bold red]An unexpected error occurred during health check:[/]")
+ console.print(e)
+ return False
+
+
+def print_payload(payload: Dict[str, Any]):
+ """Prints the JSON payload nicely with a dark theme."""
+ syntax = Syntax(
+ json.dumps(payload, indent=2),
+ "json",
+ theme="monokai", # <--- Changed theme here
+ line_numbers=False,
+ word_wrap=True # Added word wrap for potentially long payloads
+ )
+ console.print(Panel(syntax, title="Request Payload",
+ border_style="blue", expand=False))
+
+
+def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
+ """Prints a concise summary of crawl results."""
+ if not results:
+ console.print(f"[yellow]{title}: No results received.[/]")
+ return
+
+ console.print(Panel(f"[bold]{title}[/]",
+ border_style="green", expand=False))
+ count = 0
+ for result in results:
+ if count >= max_items:
+ console.print(
+ f"... (showing first {max_items} of {len(results)} results)")
+ break
+ count += 1
+ success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
+ url = result.get('url', 'N/A')
+ status = result.get('status_code', 'N/A')
+ content_info = ""
+ if result.get('extracted_content'):
+ content_str = json.dumps(result['extracted_content'])
+ snippet = (
+ content_str[:70] + '...') if len(content_str) > 70 else content_str
+ content_info = f" | Extracted: [cyan]{snippet}[/]"
+ elif result.get('markdown'):
+ content_info = f" | Markdown: [cyan]Present[/]"
+ elif result.get('html'):
+ content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
+
+ console.print(
+ f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
+ if "metadata" in result and "depth" in result["metadata"]:
+ console.print(f" Depth: {result['metadata']['depth']}")
+ if not result.get('success') and result.get('error_message'):
+ console.print(f" [red]Error: {result['error_message']}[/]")
+
+
+async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]:
+ """Handles non-streaming POST requests."""
+ console.rule(f"[bold blue]{title}[/]", style="blue")
+ print_payload(payload)
+ console.print(f"Sending POST request to {client.base_url}{endpoint}...")
+ try:
+ start_time = time.time()
+ response = await client.post(endpoint, json=payload)
+ duration = time.time() - start_time
+ console.print(
+ f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+ response.raise_for_status()
+ data = response.json()
+ if data.get("success"):
+ results = data.get("results", [])
+ print_result_summary(results, title=f"{title} Results")
+ return results
+ else:
+ console.print("[bold red]Request reported failure:[/]")
+ console.print(data)
+ return None
+ except httpx.HTTPStatusError as e:
+ console.print(f"[bold red]HTTP Error:[/]")
+ console.print(f"Status: {e.response.status_code}")
+ try:
+ console.print(Panel(Syntax(json.dumps(
+ e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+ except json.JSONDecodeError:
+ console.print(f"Response Body: {e.response.text}")
+ except httpx.RequestError as e:
+ console.print(f"[bold red]Request Error: {e}[/]")
+ except Exception as e:
+ console.print(f"[bold red]Unexpected Error: {e}[/]")
+ return None
+
+
+async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
+ """Handles streaming POST requests."""
+ console.rule(f"[bold magenta]{title}[/]", style="magenta")
+ print_payload(payload)
+ console.print(
+ f"Sending POST stream request to {client.base_url}{endpoint}...")
+ all_results = []
+ initial_status_code = None # Store initial status code
+
+ try:
+ start_time = time.time()
+ async with client.stream("POST", endpoint, json=payload) as response:
+ initial_status_code = response.status_code # Capture initial status
+ duration = time.time() - start_time # Time to first byte potentially
+ console.print(
+ f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
+ response.raise_for_status() # Raise exception for bad *initial* status codes
+
+ console.print("[magenta]--- Streaming Results ---[/]")
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ console.print(
+ "[bold green]--- Stream Completed ---[/]")
+ break
+ elif data.get("url"): # Looks like a result dictionary
+ all_results.append(data)
+ # Display summary info as it arrives
+ success_icon = "[green]✔[/]" if data.get(
+ 'success') else "[red]✘[/]"
+ url = data.get('url', 'N/A')
+ # Display status code FROM THE RESULT DATA if available
+ result_status = data.get('status_code', 'N/A')
+ console.print(
+ f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
+ if not data.get('success') and data.get('error_message'):
+ console.print(
+ f" [red]Error: {data['error_message']}[/]")
+ else:
+ console.print(
+ f" [yellow]Stream meta-data:[/yellow] {data}")
+ except json.JSONDecodeError:
+ console.print(
+ f" [red]Stream decode error for line:[/red] {line}")
+ if not completed:
+ console.print(
+ "[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
+
+ except httpx.HTTPStatusError as e:
+ # Use the captured initial status code if available, otherwise from the exception
+ status = initial_status_code if initial_status_code is not None else e.response.status_code
+ console.print(f"[bold red]HTTP Error (Initial Request):[/]")
+ console.print(f"Status: {status}")
+ try:
+ console.print(Panel(Syntax(json.dumps(
+ e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+ except json.JSONDecodeError:
+ console.print(f"Response Body: {e.response.text}")
+ except httpx.RequestError as e:
+ console.print(f"[bold red]Request Error: {e}[/]")
+ except Exception as e:
+ console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
+ # Print stack trace for unexpected errors
+ console.print_exception(show_locals=False)
+
+ # Call print_result_summary with the *collected* results AFTER the stream is done
+ print_result_summary(all_results, title=f"{title} Collected Results")
+
+
+def load_proxies_from_env() -> List[Dict]:
+ """
+ Load proxies from the PROXIES environment variable.
+ Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,...
+ Returns a list of dictionaries suitable for the 'params' of ProxyConfig.
+ """
+ proxies_params_list = []
+ proxies_str = os.getenv("PROXIES", "")
+ if not proxies_str:
+ # console.print("[yellow]PROXIES environment variable not set or empty.[/]")
+ return proxies_params_list # Return empty list if not set
+
+ try:
+ proxy_entries = proxies_str.split(",")
+ for entry in proxy_entries:
+ entry = entry.strip()
+ if not entry:
+ continue
+
+ parts = entry.split(":")
+ proxy_dict = {}
+
+ if len(parts) == 4: # Format: IP:PORT:USER:PASS
+ ip, port, username, password = parts
+ proxy_dict = {
+ "server": f"http://{ip}:{port}", # Assuming http protocol
+ "username": username,
+ "password": password,
+ # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it
+ }
+ elif len(parts) == 2: # Format: IP:PORT
+ ip, port = parts
+ proxy_dict = {
+ "server": f"http://{ip}:{port}",
+ # "ip": ip
+ }
+ else:
+ console.print(
+ f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
+ continue
+
+ proxies_params_list.append(proxy_dict)
+
+ except Exception as e:
+ console.print(
+ f"[red]Error loading proxies from environment:[/red] {e}")
+
+ if proxies_params_list:
+ console.print(
+ f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
+ # else:
+ # console.print("[yellow]No valid proxies loaded from environment.[/]")
+
+ return proxies_params_list
+
+
+# --- Demo Functions ---
+
+# 1. Basic Crawling
+async def demo_basic_single_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS"
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
+ return result
+
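+# Rough shell equivalent of the request above (a sketch - adjust the host to your BASE_URL):
+#   curl -X POST "http://localhost:11235/crawl" -H "Content-Type: application/json" \
+#        -d '{"urls": ["https://httpbin.org/html"],
+#             "browser_config": {"type": "BrowserConfig", "params": {"headless": true}},
+#             "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}}'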
+
+async def demo_basic_multi_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL, LINKS_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
+ return result
+
+
+async def demo_streaming_multi_url(client: httpx.AsyncClient):
+ payload = {
+ # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+ "urls": [
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ "https://example.com/page4",
+ "https://example.com/page5"
+        ],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": True,
+ }
+ }
+ }
+ result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
+ return result
+
+# 2. Markdown Generation & Content Filtering
+
+
+async def demo_markdown_default(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_source": "fit_html",
+ "options": {
+ "type": "dict",
+ "value": {
+ "ignore_links": True
+ }
+ }
+ }
+ } # Explicitly default
+ }
+ }
+ }
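+    # Note: plain dicts (the generator's "options" above) are serialized as
+    # {"type": "dict", "value": {...}}, alongside the type/params pattern used for strategy objects.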
+ result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
+ return result
+
+
+async def demo_markdown_pruning(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL], # Use a more complex page
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {
+ "threshold": 0.6,
+ "threshold_type": "relative"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
+ return result
+
+
+async def demo_markdown_bm25(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "BM25ContentFilter",
+ "params": {
+ "user_query": "Python documentation language reference"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter")
+ return result
+
+# 3. Specific Parameters
+# Corrected Demo Function: demo_param_css_selector
+
+
+async def demo_param_css_selector(client: httpx.AsyncClient):
+    css_selector = ".main-content"  # selector expected to wrap python.org's main content
+ payload = {
+ "urls": [PYTHON_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "css_selector": css_selector # Target specific div
+ # No extraction strategy is needed to demo this parameter's effect on input HTML
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')")
+
+ if results:
+ result = results[0]
+ if result['success'] and result.get('html'):
+ # Check if the returned HTML is likely constrained
+ # A simple check: does it contain expected content from within the selector,
+ # and does it LACK content known to be outside (like footer links)?
+ html_content = result['html']
+ # Text likely within .main-content somewhere
+ content_present = 'Python Software Foundation' in html_content
+ # Text likely in the footer, outside .main-content
+ footer_absent = 'Legal Statements' not in html_content
+
+ console.print(
+ f" Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+ console.print(
+ f" Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+
+ if not content_present or not footer_absent:
+ console.print(
+ f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+ else:
+ console.print(
+ f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+
+ elif result['success']:
+ console.print(
+ "[yellow]HTML content was empty in the successful result.[/]")
+ # Error message is handled by print_result_summary called by make_request
+
+
+async def demo_param_js_execution(client: httpx.AsyncClient):
+ payload = {
+ "urls": ["https://example.com"], # Use a page with a form
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+            # Simple JS that rewrites the <h1> text and returns the new value
+ "js_code": """
+ (() => {
+ document.querySelector('h1').innerText = 'Crawl4AI Demo';
+ return { filled_name: document.querySelector('h1').innerText };
+ })();
+ """,
+ "delay_before_return_html": 0.5 # Give JS time to potentially run
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
+ if results and results[0].get("js_execution_result"):
+ console.print("[cyan]JS Execution Result:[/]",
+ results[0]["js_execution_result"])
+ elif results:
+ console.print("[yellow]JS Execution Result not found in response.[/]")
+
+
+async def demo_param_screenshot(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "screenshot": True}
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
+ if results and results[0].get("screenshot"):
+ console.print(
+ f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
+ elif results:
+ console.print("[yellow]Screenshot data not found in response.[/]")
+
+
+async def demo_param_ssl_fetch(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL], # Needs HTTPS
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True}
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate")
+ if results and results[0].get("ssl_certificate"):
+ console.print("[cyan]SSL Certificate Info:[/]")
+ console.print(results[0]["ssl_certificate"])
+ elif results:
+ console.print("[yellow]SSL Certificate data not found in response.[/]")
+
+
+async def demo_param_proxy(client: httpx.AsyncClient):
+ proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
+ if not proxy_params_list:
+ console.rule(
+ "[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
+ console.print("Set the PROXIES environment variable to run this demo.")
+ console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
+ return
+
+ payload = {
+ "urls": ["https://httpbin.org/ip"], # URL that shows originating IP
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "proxy_rotation_strategy": {
+ "type": "RoundRobinProxyStrategy",
+ "params": {
+ "proxies": [
+ # [
+ # {
+ # "type": "ProxyConfig",
+ # "params": {
+                            #             "server": "...",
+ # "username": "...",
+ # "password": "..."
+ # }
+ # },
+ # ...
+ # ]
+
+ # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
+ {"type": "ProxyConfig", "params": {
+ k: v for k, v in p.items() if k != 'ip'}}
+ for p in proxy_params_list
+ ]
+ }
+ }
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies")
+
+ # --- Verification Logic ---
+ if results and results[0].get("success"):
+ result = results[0]
+ try:
+ # httpbin.org/ip returns JSON within the HTML body's tag
+ html_content = result.get('html', '')
+ # Basic extraction - find JSON within tags or just the JSON itself
+ json_str = None
+ if ' 500 else md
+ console.print(Panel(snippet, title="Markdown snippet",
+ border_style="cyan", expand=False))
+ except Exception as e:
+ console.print(f"[bold red]Error hitting /md:[/] {e}")
+
+# 8. LLM QA helper endpoint
+
+
+async def demo_llm_endpoint(client: httpx.AsyncClient):
+ """
+ Quick QA round-trip with /llm.
+ Asks a trivial question against SIMPLE_URL just to show wiring.
+ """
+ page_url = SIMPLE_URL
+ question = "What is the title of this page?"
+
+ console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta")
+ enc = urllib.parse.quote_plus(page_url, safe="")
+ console.print(f"GET /llm/{enc}?q={question}")
+
+ try:
+ t0 = time.time()
+ resp = await client.get(f"/llm/{enc}", params={"q": question})
+ dt = time.time() - t0
+ console.print(
+ f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+ resp.raise_for_status()
+ answer = resp.json().get("answer", "")
+ console.print(Panel(answer or "No answer returned",
+ title="LLM answer", border_style="magenta", expand=False))
+ except Exception as e:
+ console.print(f"[bold red]Error hitting /llm:[/] {e}")
+
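+# Rough shell equivalent (a sketch; the page URL is URL-encoded because it travels in the path):
+#   curl "http://localhost:11235/llm/https%3A%2F%2Fhttpbin.org%2Fhtml?q=What+is+the+title+of+this+page%3F"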
+
+# 9. /config/dump helpers --------------------------------------------------
+
+async def demo_config_dump_valid(client: httpx.AsyncClient):
+ """
+ Send a single top-level CrawlerRunConfig(...) expression and show the dump.
+ """
+ code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)"
+ payload = {"code": code_snippet}
+
+ console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue")
+ print_payload(payload)
+
+ try:
+ t0 = time.time()
+ resp = await client.post("/config/dump", json=payload)
+ dt = time.time() - t0
+ console.print(
+ f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+ resp.raise_for_status()
+ dump_json = resp.json()
+ console.print(Panel(Syntax(json.dumps(dump_json, indent=2),
+ "json", theme="monokai"), title="Dump()", border_style="cyan"))
+ except Exception as e:
+ console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
+
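+# Rough shell equivalent (a sketch):
+#   curl -X POST "http://localhost:11235/config/dump" -H "Content-Type: application/json" \
+#        -d '{"code": "CrawlerRunConfig(screenshot=True)"}'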
+
+async def demo_config_dump_invalid(client: httpx.AsyncClient):
+ """
+    Purposely break the rule (more than one top-level expression) to show the 400 parse error.
+ """
+ bad_code = textwrap.dedent("""
+ BrowserConfig(headless=True); CrawlerRunConfig()
+ """).strip()
+ payload = {"code": bad_code}
+
+ console.rule(
+ "[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
+ print_payload(payload)
+
+ try:
+ resp = await client.post("/config/dump", json=payload)
+ console.print(
+ f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
+ resp.raise_for_status() # should throw -> except
+ except httpx.HTTPStatusError as e:
+ console.print("[cyan]Expected parse/validation failure captured:[/]")
+ try:
+ console.print(Panel(Syntax(json.dumps(
+ e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
+ except Exception:
+ console.print(e.response.text)
+ except Exception as e:
+ console.print(
+ f"[bold red]Unexpected error during invalid test:[/] {e}")
+
+
+# --- Update Main Runner to include new demo ---
+async def main_demo():
+ async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
+ if not await check_server_health(client):
+ return
+
+ # --- Run Demos ---
+ # await demo_basic_single_url(client)
+ # await demo_basic_multi_url(client)
+ # await demo_streaming_multi_url(client)
+
+ # await demo_markdown_default(client)
+ # await demo_markdown_pruning(client)
+ # await demo_markdown_bm25(client)
+
+ # await demo_param_css_selector(client)
+ # await demo_param_js_execution(client)
+ # await demo_param_screenshot(client)
+ # await demo_param_ssl_fetch(client)
+ # await demo_param_proxy(client) # Skips if no PROXIES env var
+
+ # await demo_extract_css(client)
+ # await demo_extract_llm(client) # Skips if no common LLM key env var
+
+ # await demo_deep_basic(client)
+        # await demo_deep_streaming(client) # This needs extra work
+
+ # await demo_deep_with_css_extraction(client)
+ # # Skips if no common LLM key env var
+ # await demo_deep_with_llm_extraction(client)
+ # await demo_deep_with_proxy(client) # Skips if no PROXIES env var
+ # await demo_deep_with_ssl(client) # Added the new demo
+
+ # --- Helper endpoints ---
+ await demo_markdown_endpoint(client)
+ await demo_llm_endpoint(client)
+
+ # --- /config/dump sanity checks ---
+ await demo_config_dump_valid(client)
+ await demo_config_dump_invalid(client)
+
+ console.rule("[bold green]Demo Complete[/]", style="green")
+
+
+if __name__ == "__main__":
+ try:
+ asyncio.run(main_demo())
+ except KeyboardInterrupt:
+ console.print("\n[yellow]Demo interrupted by user.[/]")
+ except Exception as e:
+ console.print(
+ f"\n[bold red]An error occurred during demo execution:[/]")
+ console.print_exception(show_locals=False)
diff --git a/docs/examples/docker/demo_docker_polling.py b/docs/examples/docker/demo_docker_polling.py
new file mode 100644
index 00000000..ee895723
--- /dev/null
+++ b/docs/examples/docker/demo_docker_polling.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+demo_docker_polling.py
+Quick sanity-check for the asynchronous crawl job endpoints:
+
+ • POST /crawl/job – enqueue work, get task_id
+ • GET /crawl/job/{id} – poll status / fetch result
+
+The style matches demo_docker_api.py (console.rule banners, helper
+functions, coloured status lines). Adjust BASE_URL as needed.
+
+Run: python demo_docker_polling.py
+"""
+
+import asyncio, json, os, time
+from typing import Dict
+
+import httpx
+from rich.console import Console
+from rich.panel import Panel
+from rich.syntax import Syntax
+
+console = Console()
+BASE_URL = os.getenv("BASE_URL", "http://localhost:11235")
+SIMPLE_URL = "https://example.org"
+LINKS_URL = "https://httpbin.org/links/10/1"
+
+# --- helpers --------------------------------------------------------------
+
+
+def print_payload(payload: Dict):
+ console.print(Panel(Syntax(json.dumps(payload, indent=2),
+ "json", theme="monokai", line_numbers=False),
+ title="Payload", border_style="cyan", expand=False))
+
+
+async def check_server_health(client: httpx.AsyncClient) -> bool:
+ try:
+ resp = await client.get("/health")
+ if resp.is_success:
+ console.print("[green]Server healthy[/]")
+ return True
+ except Exception:
+ pass
+ console.print("[bold red]Server is not responding on /health[/]")
+ return False
+
+
+async def poll_for_result(client: httpx.AsyncClient, task_id: str,
+ poll_interval: float = 1.5, timeout: float = 90.0):
+ """Hit /crawl/job/{id} until COMPLETED/FAILED or timeout."""
+ start = time.time()
+ while True:
+ resp = await client.get(f"/crawl/job/{task_id}")
+ resp.raise_for_status()
+ data = resp.json()
+ status = data.get("status")
+        if status and status.upper() in ("COMPLETED", "FAILED"):
+ return data
+ if time.time() - start > timeout:
+ raise TimeoutError(f"Task {task_id} did not finish in {timeout}s")
+ await asyncio.sleep(poll_interval)
+
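+# Typical use, mirroring the demos below:
+#   task_id = (await client.post("/crawl/job", json=payload)).json()["task_id"]
+#   final   = await poll_for_result(client, task_id)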
+
+# --- demo functions -------------------------------------------------------
+
+
+async def demo_poll_single_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig",
+ "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS"}}
+ }
+
+ console.rule("[bold blue]Demo A: /crawl/job Single URL[/]", style="blue")
+ print_payload(payload)
+
+ # enqueue
+ resp = await client.post("/crawl/job", json=payload)
+ console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
+ resp.raise_for_status()
+ task_id = resp.json()["task_id"]
+ console.print(f"Task ID: [yellow]{task_id}[/]")
+
+ # poll
+ console.print("Polling…")
+ result = await poll_for_result(client, task_id)
+ console.print(Panel(Syntax(json.dumps(result, indent=2),
+ "json", theme="fruity"),
+ title="Final result", border_style="green"))
+ if result["status"] == "COMPLETED":
+ console.print("[green]✅ Crawl succeeded[/]")
+ else:
+ console.print("[red]❌ Crawl failed[/]")
+
+
+async def demo_poll_multi_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL, LINKS_URL],
+ "browser_config": {"type": "BrowserConfig",
+ "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS"}}
+ }
+
+ console.rule("[bold magenta]Demo B: /crawl/job Multi-URL[/]",
+ style="magenta")
+ print_payload(payload)
+
+ resp = await client.post("/crawl/job", json=payload)
+ console.print(f"Enqueue status: [bold]{resp.status_code}[/]")
+ resp.raise_for_status()
+ task_id = resp.json()["task_id"]
+ console.print(f"Task ID: [yellow]{task_id}[/]")
+
+ console.print("Polling…")
+ result = await poll_for_result(client, task_id)
+ console.print(Panel(Syntax(json.dumps(result, indent=2),
+ "json", theme="fruity"),
+ title="Final result", border_style="green"))
+ if result["status"] == "COMPLETED":
+ console.print(
+ f"[green]✅ {len(json.loads(result['result'])['results'])} URLs crawled[/]")
+ else:
+ console.print("[red]❌ Crawl failed[/]")
+
+
+# --- main runner ----------------------------------------------------------
+
+
+async def main_demo():
+ async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
+ if not await check_server_health(client):
+ return
+ await demo_poll_single_url(client)
+ await demo_poll_multi_url(client)
+ console.rule("[bold green]Polling demos complete[/]", style="green")
+
+
+if __name__ == "__main__":
+ try:
+ asyncio.run(main_demo())
+ except KeyboardInterrupt:
+ console.print("\n[yellow]Interrupted by user[/]")
+ except Exception:
+ console.print_exception(show_locals=False)
diff --git a/docs/examples/docker_config_obj.py b/docs/examples/docker_config_obj.py
new file mode 100644
index 00000000..6ddf157a
--- /dev/null
+++ b/docs/examples/docker_config_obj.py
@@ -0,0 +1,249 @@
+from crawl4ai import BrowserConfig, CrawlerRunConfig, PruningContentFilter, DefaultMarkdownGenerator
+from crawl4ai.deep_crawling.filters import ContentTypeFilter, DomainFilter
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer, PathDepthScorer
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.bfs_strategy import BFSDeepCrawlStrategy
+from crawl4ai.deep_crawling.filters import FilterChain
+from crawl4ai.deep_crawling.scorers import CompositeScorer
+from crawl4ai.docker_client import Crawl4aiDockerClient
+import json
+from rich.console import Console
+from rich.syntax import Syntax
+
+console = Console()
+
+def print_json(data: dict, title: str = None):
+ """Helper to print JSON prettily with syntax highlighting"""
+ if title:
+ console.print(f"\n[bold blue]{title}[/bold blue]")
+ json_str = json.dumps(data, indent=2)
+ syntax = Syntax(json_str, "json", theme="monokai", line_numbers=True)
+ console.print(syntax)
+
+async def part1_basic_config():
+ """PART 1: Understanding Basic Configuration Objects
+
+ Here we create simple configuration objects and examine their structure.
+ This helps understand the basic type-params pattern used throughout the API.
+ """
+ console.print("\n[bold green]Explanation:[/bold green] Configuration objects like BrowserConfig and CrawlerRunConfig are the foundation of Crawl4AI. They define how the crawler behaves—e.g., whether it runs headless or how it processes content. These objects use a 'type-params' pattern: 'type' identifies the object class, and 'params' holds its settings. This structure is key because it’s reusable and can be serialized into JSON for API calls.")
+
+ # Create a simple browser config
+ browser_config = BrowserConfig(
+ headless=False,
+ viewport_width=500,
+ headers = {"User-Agent": "Mozilla/5.0"}
+ )
+
+ # Show its structure
+ print_json(browser_config.dump(), "Simple Browser Config Structure")
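+    # For orientation, the dump follows the type/params pattern, roughly (values illustrative):
+    #   {"type": "BrowserConfig", "params": {"headless": false, "viewport_width": 500, ...}}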
+
+ # Create a more complex config with nested objects
+ crawler_config = CrawlerRunConfig(
+ word_count_threshold=200,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(threshold=0.5)
+ )
+ )
+
+ print_json(crawler_config.dump(), "Complex Config with Nested Objects")
+
+async def part2_manual_json():
+ """PART 2: Building JSON Manually
+
+ Learn how to construct the JSON structure by hand.
+ This demonstrates deep understanding of the configuration format.
+ """
+ console.print("\n[bold green]Explanation:[/bold green] Manually building JSON configurations mirrors how the API expects data. It’s a hands-on way to learn the exact structure—each object has a 'type' and 'params' section. This is useful when you’re troubleshooting or working without the SDK, as it forces you to understand every detail of the config format.")
+
+ # Manual browser config
+ manual_browser = {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ "viewport": {
+ "type": "dict",
+ "value": {
+ "width": 1200,
+ "height": 800
+ }
+ }
+ }
+ }
+
+ # Validate by loading into BrowserConfig
+ loaded_config = BrowserConfig.load(manual_browser)
+ print_json(loaded_config.dump(), "Manually Created -> Loaded -> Dumped")
+
+ # Show they're equivalent
+ original = BrowserConfig(headless=True, viewport={"width": 1200, "height": 800})
+    assert loaded_config.dump() == original.dump(), "Configs should be equivalent"
+
+async def part3_complex_structures():
+ """PART 3: Working with Complex Nested Structures
+
+ Explore more complex configurations with multiple levels of nesting.
+ This shows how the type-params pattern scales to complex scenarios.
+ """
+ console.print("\n[bold green]Explanation:[/bold green] Real-world crawling often requires detailed settings—like filtering content or customizing output. Here, we nest objects (e.g., a markdown generator with a content filter) using the same 'type-params' pattern. This nesting lets you fine-tune the crawler’s behavior at multiple levels, making it powerful and flexible.")
+
+ config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter()
+ ),
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=5,
+ filter_chain=FilterChain(
+ filters=[
+ ContentTypeFilter(allowed_types=["text/html"]),
+ DomainFilter(allowed_domains=["example.com"])
+ ]
+ ),
+ url_scorer=CompositeScorer(
+ scorers=[
+ KeywordRelevanceScorer(keywords=["data", "analysis"]),
+ PathDepthScorer(optimal_depth=3)
+ ]
+ )
+ )
+ )
+
+ print_json(config.dump(), "Deep Nested Configuration")
+
+async def part4_client_sdk():
+ """PART 4: Using the Client SDK
+
+ Demonstrate how the SDK makes working with the API simple by handling
+ all the complex serialization automatically.
+ """
+ console.print("\n[bold green]Explanation:[/bold green] The Crawl4aiDockerClient SDK is a time-saver—it takes your configuration objects and turns them into API-ready JSON automatically. This means less manual work and fewer mistakes. You just define your settings, pass them to the SDK, and it handles the rest, making crawling easier and faster.")
+
+ async with Crawl4aiDockerClient(base_url="http://localhost:8000") as client:
+ # You would normally authenticate here if JWT is enabled
+ await client.authenticate("user@example.com")
+
+ # Create configs
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(stream=False)
+
+ # SDK handles all serialization
+ result = await client.crawl(
+ urls=["https://example.com"],
+ browser_config=browser_config,
+ crawler_config=crawler_config
+ )
+
+ console.print("\n[bold green]🚀 Crawl completed successfully![/bold green]")
+ console.print(f"Markdown length: {len(result.markdown)} characters")
+
+async def part5_direct_api():
+ """PART 5: Using the API Directly
+
+ Learn how to make direct API calls without the SDK.
+ This demonstrates the raw request structure and gives more control.
+ """
+ console.print("\n[bold green]Explanation:[/bold green] Skipping the SDK means you’re in full control—you build the JSON payload yourself and send it to the API. This is harder but gives you a deeper understanding of how Crawl4AI works under the hood. It’s also useful if you’re integrating with systems that don’t use the SDK.")
+
+ import aiohttp
+ from datetime import datetime
+
+ # Prepare the request payload
+ payload = {
+ "urls": ["https://example.com"],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ "viewport": {
+ "type": "dict",
+ "value": {
+ "width": 1200,
+ "height": 800
+ }
+ }
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "bypass",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {
+ "threshold": 0.48,
+ "threshold_type": "fixed"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ print_json(payload, "Direct API Request Payload")
+
+ async with aiohttp.ClientSession() as session:
+ # If JWT is enabled, get token first
+ token_response = await session.post(
+ "http://localhost:8000/token",
+ json={"email": "user@example.com"}
+ )
+ token = (await token_response.json())["access_token"]
+ headers = {"Authorization": f"Bearer {token}"}
+
+ # Make the crawl request
+ start_time = datetime.now()
+ async with session.post(
+ "http://localhost:8000/crawl",
+ json=payload,
+            headers=headers  # omit this if JWT auth is disabled on the server
+ ) as response:
+ result = await response.json()
+ duration = (datetime.now() - start_time).total_seconds()
+
+ console.print(f"\n[bold green]✅ API call completed in {duration:.2f}s[/bold green]")
+ print_json(result, "API Response")
+
+async def part6_wrap_up():
+ """PART 6: Wrap-Up and Key Takeaways
+
+ Summarize the key concepts learned in this tutorial.
+ """
+ console.print("\n[bold yellow]🎓 Tutorial Wrap-Up[/bold yellow]")
+ console.print("[italic]Key Takeaways:[/italic]\n")
+ console.print("- **Configurations:** Use the type-params pattern to define settings flexibly.")
+ console.print("- **Manual JSON:** Build configs by hand to master the structure.")
+ console.print("- **Nesting:** Customize deeply with nested objects.")
+ console.print("- **SDK:** Simplify API calls with automatic serialization.")
+ console.print("- **Direct API:** Gain control by crafting raw requests.")
+ console.print("\n[bold green]🚀 You’re ready to crawl with Crawl4AI![/bold green]")
+
+async def main():
+ """Main tutorial runner that executes each part in sequence"""
+ console.print("\n[bold yellow]🎓 Crawl4AI Docker Tutorial[/bold yellow]")
+ console.print("[italic]Learn how to work with configuration objects and the Docker API[/italic]\n")
+
+ parts = [
+ (part1_basic_config, "Understanding Basic Configurations"),
+ (part2_manual_json, "Manual JSON Construction"),
+ (part3_complex_structures, "Complex Nested Structures"),
+ (part4_client_sdk, "Using the Client SDK"),
+ (part5_direct_api, "Direct API Integration"),
+ (part6_wrap_up, "Wrap-Up and Key Takeaways")
+ ]
+
+ for func, title in parts:
+ console.print(f"\n[bold cyan]📚 {title}[/bold cyan]")
+ console.print("[dim]" + func.__doc__.strip() + "[/dim]\n")
+ await func()
+ if func != part6_wrap_up: # No pause after wrap-up
+ input("\nPress Enter to continue...\n")
+
+# Run the tutorial
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/docker_python_rest_api.py b/docs/examples/docker_python_rest_api.py
new file mode 100644
index 00000000..6650f8d5
--- /dev/null
+++ b/docs/examples/docker_python_rest_api.py
@@ -0,0 +1,214 @@
+import asyncio
+import json
+from typing import Optional
+from urllib.parse import quote
+
+async def get_token(session, email: str = "test@example.com") -> str:
+ """Fetch a JWT token from the /token endpoint."""
+ url = "http://localhost:8000/token"
+ payload = {"email": email}
+ print(f"\nFetching token from {url} with email: {email}")
+ try:
+ async with session.post(url, json=payload) as response:
+ status = response.status
+ data = await response.json()
+ print(f"Token Response Status: {status}")
+ print(f"Token Response: {json.dumps(data, indent=2)}")
+ if status == 200:
+ return data["access_token"]
+ else:
+ raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
+ except Exception as e:
+ print(f"Error fetching token: {str(e)}")
+ raise
+
+async def test_endpoint(
+ session,
+ endpoint: str,
+ url: str,
+ token: str,
+ params: Optional[dict] = None,
+ expected_status: int = 200
+) -> Optional[dict]:
+ """Test an endpoint with token and print results."""
+ params = params or {}
+ param_str = "&".join(f"{k}={v}" for k, v in params.items())
+ full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
+ if param_str:
+ full_url += f"?{param_str}"
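+    # e.g. endpoint="md", url="example.com", params={"f": "fit"} builds
+    #   http://localhost:8000/md/example.com?f=fit   (illustrative values)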
+
+ headers = {"Authorization": f"Bearer {token}"}
+ print(f"\nTesting: {full_url}")
+
+ try:
+ async with session.get(full_url, headers=headers) as response:
+ status = response.status
+            try:
+                data = await response.json()
+            except Exception:
+                data = await response.text()
+
+ print(f"Status: {status} (Expected: {expected_status})")
+ if isinstance(data, dict):
+ print(f"Response: {json.dumps(data, indent=2)}")
+ else:
+ print(f"Response: {data[:500]}...") # First 500 chars
+ assert status == expected_status, f"Expected {expected_status}, got {status}"
+ return data
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ return None
+
+
+async def test_stream_crawl(session, token: str):
+ """Test the /crawl/stream endpoint with multiple URLs."""
+ url = "http://localhost:8000/crawl/stream"
+ payload = {
+ "urls": [
+ "https://example.com",
+ "https://example.com/page1", # Replicated example.com with variation
+ "https://example.com/page2", # Replicated example.com with variation
+ "https://example.com/page3", # Replicated example.com with variation
+ # "https://www.python.org",
+ # "https://news.ycombinator.com/news"
+ ],
+ "browser_config": {"headless": True, "viewport": {"width": 1200}},
+ "crawler_config": {"stream": True, "cache_mode": "bypass"}
+ }
+ headers = {"Authorization": f"Bearer {token}"}
+ print(f"\nTesting Streaming Crawl: {url}")
+ print(f"Payload: {json.dumps(payload, indent=2)}")
+
+ try:
+ async with session.post(url, json=payload, headers=headers) as response:
+ status = response.status
+ print(f"Status: {status} (Expected: 200)")
+ assert status == 200, f"Expected 200, got {status}"
+
+ # Read streaming response line-by-line (NDJSON)
+ async for line in response.content:
+ if line:
+ data = json.loads(line.decode('utf-8').strip())
+ print(f"Streamed Result: {json.dumps(data, indent=2)}")
+ except Exception as e:
+ print(f"Error in streaming crawl test: {str(e)}")
+
+async def run_tests():
+ import aiohttp
+ print("Starting API Tests...")
+
+ # Test URLs
+ urls = [
+ "example.com",
+ "https://www.python.org",
+ "https://news.ycombinator.com/news",
+ "https://github.com/trending"
+ ]
+
+ async with aiohttp.ClientSession() as session:
+ token = "test_token"
+ # If jwt is enabled, authenticate first
+ # Fetch token once and reuse it
+ # token = await get_token(session)
+ # if not token:
+ # print("Aborting tests due to token failure!")
+ # return
+
+ print("\n=== Testing Crawl Endpoint ===")
+ crawl_payload = {
+ "urls": ["https://example.com"],
+ "browser_config": {"headless": True},
+ "crawler_config": {"stream": False}
+ }
+ async with session.post(
+ "http://localhost:8000/crawl",
+ json=crawl_payload,
+ headers={"Authorization": f"Bearer {token}"}
+ ) as response:
+ status = response.status
+ data = await response.json()
+ print(f"\nCrawl Endpoint Status: {status}")
+ print(f"Crawl Response: {json.dumps(data, indent=2)}")
+
+
+ print("\n=== Testing Crawl Stream Endpoint ===")
+ await test_stream_crawl(session, token)
+
+ print("\n=== Testing Markdown Endpoint ===")
+    for url in []:  # change [] to `urls` to enable the markdown endpoint tests
+ for filter_type in ["raw", "fit", "bm25", "llm"]:
+ params = {"f": filter_type}
+ if filter_type in ["bm25", "llm"]:
+ params["q"] = "extract main content"
+
+ for cache in ["0", "1"]:
+ params["c"] = cache
+ await test_endpoint(session, "md", url, token, params)
+ await asyncio.sleep(1) # Be nice to the server
+
+ print("\n=== Testing LLM Endpoint ===")
+ for url in urls:
+ # Test basic extraction (direct response now)
+ result = await test_endpoint(
+ session,
+ "llm",
+ url,
+ token,
+ {"q": "Extract title and main content"}
+ )
+
+ # Test with schema (direct response)
+ schema = {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "content": {"type": "string"},
+ "links": {"type": "array", "items": {"type": "string"}}
+ }
+ }
+ result = await test_endpoint(
+ session,
+ "llm",
+ url,
+ token,
+ {
+ "q": "Extract content with links",
+ "s": json.dumps(schema),
+ "c": "1" # Test with cache
+ }
+ )
+ await asyncio.sleep(2) # Be nice to the server
+
+ print("\n=== Testing Error Cases ===")
+ # Test invalid URL
+ await test_endpoint(
+ session,
+ "md",
+ "not_a_real_url",
+ token,
+ expected_status=500
+ )
+
+ # Test invalid filter type
+ await test_endpoint(
+ session,
+ "md",
+ "example.com",
+ token,
+ {"f": "invalid"},
+ expected_status=422
+ )
+
+    # Test LLM without query (should fail per the server's validation)
+ await test_endpoint(
+ session,
+ "llm",
+ "example.com",
+ token,
+ expected_status=400
+ )
+
+ print("\nAll tests completed!")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/docs/examples/docker_python_sdk.py b/docs/examples/docker_python_sdk.py
new file mode 100644
index 00000000..72091da0
--- /dev/null
+++ b/docs/examples/docker_python_sdk.py
@@ -0,0 +1,35 @@
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import (
+ BrowserConfig,
+ CrawlerRunConfig
+)
+
+async def main():
+ async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
+ # If jwt is enabled, authenticate first
+ # await client.authenticate("test@example.com")
+
+ # Non-streaming crawl
+ results = await client.crawl(
+ ["https://example.com", "https://python.org"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=CrawlerRunConfig()
+ )
+ print(f"Non-streaming results: {results}")
+
+ # Streaming crawl
+ crawler_config = CrawlerRunConfig(stream=True)
+ async for result in await client.crawl(
+ ["https://example.com", "https://python.org"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=crawler_config
+ ):
+ print(f"Streamed result: {result}")
+
+ # Get schema
+ schema = await client.get_schema()
+ print(f"Schema: {schema}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/extraction_strategies_example.py b/docs/examples/extraction_strategies_examples.py
similarity index 90%
rename from docs/examples/extraction_strategies_example.py
rename to docs/examples/extraction_strategies_examples.py
index 658f7521..84192f97 100644
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_examples.py
@@ -11,6 +11,7 @@ import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,
@@ -38,9 +39,9 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
if result.success:
print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
- print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
print(
- f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
+ f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
)
else:
print(f"Error in {name}: Crawl failed")
@@ -60,22 +61,19 @@ async def main():
# 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4o-mini",
- api_token=os.getenv("OPENAI_API_KEY"),
+ llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information including name, price, and description",
)
html_strategy = LLMExtractionStrategy(
input_format="html",
- provider="openai/gpt-4o-mini",
- api_token=os.getenv("OPENAI_API_KEY"),
+ llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from HTML including structured data",
)
fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown",
- provider="openai/gpt-4o-mini",
- api_token=os.getenv("OPENAI_API_KEY"),
+ llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from cleaned markdown",
)
diff --git a/docs/examples/full_page_screenshot_and_pdf_export.md b/docs/examples/full_page_screenshot_and_pdf_export.md
index 8522675c..bf11f8db 100644
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page
**Simple Example:**
```python
-import os, sys
+import os
+import sys
import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
# Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,9 +27,11 @@ async def main():
# Request both PDF and screenshot
result = await crawler.arun(
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
- cache_mode=CacheMode.BYPASS,
- pdf=True,
- screenshot=True
+ config=CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ pdf=True,
+ screenshot=True
+ )
)
if result.success:
@@ -40,9 +43,8 @@ async def main():
# Save PDF
if result.pdf:
- pdf_bytes = b64decode(result.pdf)
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
- f.write(pdf_bytes)
+ f.write(result.pdf)
if __name__ == "__main__":
asyncio.run(main())
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
index 97a8187e..2ba2e852 100644
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -1,23 +1,29 @@
import asyncio
-from crawl4ai import *
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ DefaultMarkdownGenerator,
+ PruningContentFilter,
+ CrawlResult
+)
async def main():
- browser_config = BrowserConfig(headless=True, verbose=True)
+ browser_config = BrowserConfig(
+ headless=False,
+ verbose=True,
+ )
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
- cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
+ content_filter=PruningContentFilter()
),
)
- result = await crawler.arun(
+ result: CrawlResult = await crawler.arun(
url="https://www.helloworld.org", config=crawler_config
)
- print(result.markdown_v2.raw_markdown[:500])
-
+ print(result.markdown.raw_markdown[:500])
if __name__ == "__main__":
asyncio.run(main())
diff --git a/docs/examples/identity_based_browsing.py b/docs/examples/identity_based_browsing.py
new file mode 100644
index 00000000..01596948
--- /dev/null
+++ b/docs/examples/identity_based_browsing.py
@@ -0,0 +1,108 @@
+"""
+Identity-Based Browsing Example with Crawl4AI
+
+This example demonstrates how to:
+1. Create a persistent browser profile interactively
+2. List available profiles
+3. Use a saved profile for crawling authenticated sites
+4. Delete profiles when no longer needed
+
+Uses the new BrowserProfiler class for profile management.
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+from crawl4ai.browser_profiler import BrowserProfiler
+from crawl4ai.async_logger import AsyncLogger
+from colorama import Fore, Style, init
+
+# Initialize colorama
+init()
+
+# Create a shared logger instance
+logger = AsyncLogger(verbose=True)
+
+# Create a shared BrowserProfiler instance
+profiler = BrowserProfiler(logger=logger)
+
+
+async def crawl_with_profile(profile_path, url):
+ """Use a profile to crawl an authenticated page"""
+ logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")
+
+ # Create browser config with the profile path
+ browser_config = BrowserConfig(
+        headless=False,  # keep False to watch the browser window; set True to run invisibly
+ use_managed_browser=True, # Required for persistent profiles
+ user_data_dir=profile_path
+ )
+
+ start_time = asyncio.get_event_loop().time()
+
+ # Initialize crawler with the browser config
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Crawl the URL - You should have access to authenticated content now
+ result = await crawler.arun(url)
+
+ elapsed_time = asyncio.get_event_loop().time() - start_time
+
+ if result.success:
+ # Use url_status method for consistent logging
+ logger.url_status(url, True, elapsed_time, tag="CRAWL")
+
+ # Print page title or some indication of success
+ title = result.metadata.get("title", "")
+ logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
+ return result
+ else:
+ # Log error status
+ logger.error_status(url, result.error_message, tag="CRAWL")
+ return None
+
+
+async def main():
+ logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
+ logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")
+
+ # Choose between interactive mode and automatic mode
+ mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()
+
+ if mode == 'i':
+ # Interactive profile management - use the interactive_manager method
+ # Pass the crawl_with_profile function as the callback for the "crawl a website" option
+ await profiler.interactive_manager(crawl_callback=crawl_with_profile)
+ else:
+ # Automatic mode - simplified example
+ profiles = profiler.list_profiles()
+
+ if not profiles:
+ # Create a new profile if none exists
+ logger.info("No profiles found. Creating a new one...", tag="DEMO")
+ profile_path = await profiler.create_profile()
+ if not profile_path:
+ logger.error("Cannot proceed without a valid profile", tag="DEMO")
+ return
+ else:
+ # Use the first (most recent) profile
+ profile_path = profiles[0]["path"]
+ logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
+
+ # Example: Crawl an authenticated page
+ urls_to_crawl = [
+ "https://github.com/settings/profile", # GitHub requires login
+ # "https://twitter.com/home", # Twitter requires login
+ # "https://www.linkedin.com/feed/", # LinkedIn requires login
+ ]
+
+ for url in urls_to_crawl:
+ await crawl_with_profile(profile_path, url)
+
+
+if __name__ == "__main__":
+ try:
+ # Run the async main function
+ asyncio.run(main())
+ except KeyboardInterrupt:
+ logger.warning("Example interrupted by user", tag="DEMO")
+ except Exception as e:
+ logger.error(f"Error in example: {str(e)}", tag="DEMO")
\ No newline at end of file
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
index e9e90dd2..27a1c310 100644
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,9 +1,11 @@
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
+from crawl4ai import LLMConfig
+from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
+import os
+import json
from pydantic import BaseModel, Field
-url = r"https://openai.com/api/pricing/"
+url = "https://openai.com/api/pricing/"
class OpenAIModelFee(BaseModel):
@@ -13,10 +15,6 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
-
-from crawl4ai import AsyncWebCrawler
-
-
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
@@ -25,8 +23,7 @@ async def main():
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
- provider="groq/llama-3.1-70b-versatile",
- api_token=os.getenv("GROQ_API_KEY"),
+ llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their "
diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py
index 60b8549d..777c59b0 100644
--- a/docs/examples/llm_markdown_generator.py
+++ b/docs/examples/llm_markdown_generator.py
@@ -1,6 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
@@ -43,9 +43,9 @@ async def test_llm_filter():
)
filter = LLMContentFilter(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
+        chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192 tokens per chunk
+        ignore_cache=True,
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
@@ -68,7 +68,7 @@ async def test_llm_filter():
)
# Apply filtering
- filtered_content = filter.filter_content(html, ignore_cache = True)
+ filtered_content = filter.filter_content(html)
# Show results
print("\nFiltered Content Length:", len(filtered_content))
diff --git a/docs/examples/markdown/content_source_example.py b/docs/examples/markdown/content_source_example.py
new file mode 100644
index 00000000..5d836765
--- /dev/null
+++ b/docs/examples/markdown/content_source_example.py
@@ -0,0 +1,64 @@
+"""
+Example showing how to use the content_source parameter to control HTML input for markdown generation.
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_content_source():
+ """Demonstrates different content_source options for markdown generation."""
+ url = "https://example.com" # Simple demo site
+
+ print("Crawling with different content_source options...")
+
+ # --- Example 1: Default Behavior (cleaned_html) ---
+ # This uses the HTML after it has been processed by the scraping strategy
+ # The HTML is cleaned, simplified, and optimized for readability
+ default_generator = DefaultMarkdownGenerator() # content_source="cleaned_html" is default
+ default_config = CrawlerRunConfig(markdown_generator=default_generator)
+
+ # --- Example 2: Raw HTML ---
+ # This uses the original HTML directly from the webpage
+ # Preserves more original content but may include navigation, ads, etc.
+ raw_generator = DefaultMarkdownGenerator(content_source="raw_html")
+ raw_config = CrawlerRunConfig(markdown_generator=raw_generator)
+
+ # --- Example 3: Fit HTML ---
+ # This uses preprocessed HTML optimized for schema extraction
+ # Better for structured data extraction but may lose some formatting
+ fit_generator = DefaultMarkdownGenerator(content_source="fit_html")
+ fit_config = CrawlerRunConfig(markdown_generator=fit_generator)
+
+ # Execute all three crawlers in sequence
+ async with AsyncWebCrawler() as crawler:
+ # Default (cleaned_html)
+ result_default = await crawler.arun(url=url, config=default_config)
+
+ # Raw HTML
+ result_raw = await crawler.arun(url=url, config=raw_config)
+
+ # Fit HTML
+ result_fit = await crawler.arun(url=url, config=fit_config)
+
+ # Print a summary of the results
+ print("\nMarkdown Generation Results:\n")
+
+ print("1. Default (cleaned_html):")
+ print(f" Length: {len(result_default.markdown.raw_markdown)} chars")
+ print(f" First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n")
+
+ print("2. Raw HTML:")
+ print(f" Length: {len(result_raw.markdown.raw_markdown)} chars")
+ print(f" First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n")
+
+ print("3. Fit HTML:")
+ print(f" Length: {len(result_fit.markdown.raw_markdown)} chars")
+ print(f" First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n")
+
+ # Demonstrate differences in output
+ print("\nKey Takeaways:")
+ print("- cleaned_html: Best for readable, focused content")
+ print("- raw_html: Preserves more original content, but may include noise")
+ print("- fit_html: Optimized for schema extraction and structured data")
+
+if __name__ == "__main__":
+ asyncio.run(demo_content_source())
\ No newline at end of file
diff --git a/docs/examples/markdown/content_source_short_example.py b/docs/examples/markdown/content_source_short_example.py
new file mode 100644
index 00000000..83c3ecb4
--- /dev/null
+++ b/docs/examples/markdown/content_source_short_example.py
@@ -0,0 +1,42 @@
+"""
+Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_markdown_source_config():
+ print("\n=== Demo: Configuring Markdown Source ===")
+
+ # Example 1: Generate markdown from cleaned HTML (default behavior)
+ cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
+ config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
+
+ async with AsyncWebCrawler() as crawler:
+ result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
+ print("Markdown from Cleaned HTML (default):")
+ print(f" Length: {len(result_cleaned.markdown.raw_markdown)}")
+ print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...")
+
+ # Example 2: Generate markdown directly from raw HTML
+ raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
+ config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
+
+ async with AsyncWebCrawler() as crawler:
+ result_raw = await crawler.arun(url="https://example.com", config=config_raw)
+ print("\nMarkdown from Raw HTML:")
+ print(f" Length: {len(result_raw.markdown.raw_markdown)}")
+ print(f" Start: {result_raw.markdown.raw_markdown[:100]}...")
+
+ # Example 3: Generate markdown from preprocessed 'fit' HTML
+ fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
+ config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
+
+ async with AsyncWebCrawler() as crawler:
+ result_fit = await crawler.arun(url="https://example.com", config=config_fit)
+ print("\nMarkdown from Fit HTML:")
+ print(f" Length: {len(result_fit.markdown.raw_markdown)}")
+ print(f" Start: {result_fit.markdown.raw_markdown[:100]}...")
+
+if __name__ == "__main__":
+ asyncio.run(demo_markdown_source_config())
\ No newline at end of file
diff --git a/docs/examples/network_console_capture_example.py b/docs/examples/network_console_capture_example.py
new file mode 100644
index 00000000..0208bdce
--- /dev/null
+++ b/docs/examples/network_console_capture_example.py
@@ -0,0 +1,477 @@
+import asyncio
+import json
+import os
+import base64
+from pathlib import Path
+from typing import List, Dict, Any
+from datetime import datetime
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
+from crawl4ai import BrowserConfig
+
+__cur_dir__ = Path(__file__).parent
+
+# Create temp directory if it doesn't exist
+os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
+
+async def demo_basic_network_capture():
+ """Basic network request capturing example"""
+ print("\n=== 1. Basic Network Request Capturing ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ wait_until="networkidle" # Wait for network to be idle
+ )
+
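+        # Each captured event is a dict whose "event_type" is "request", "response", or "request_failed"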
+ result = await crawler.arun(
+ url="https://example.com/",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ print(f"Captured {len(result.network_requests)} network events")
+
+ # Count by event type
+ event_types = {}
+ for req in result.network_requests:
+ event_type = req.get("event_type", "unknown")
+ event_types[event_type] = event_types.get(event_type, 0) + 1
+
+ print("Event types:")
+ for event_type, count in event_types.items():
+ print(f" - {event_type}: {count}")
+
+ # Show a sample request and response
+ request = next((r for r in result.network_requests if r.get("event_type") == "request"), None)
+ response = next((r for r in result.network_requests if r.get("event_type") == "response"), None)
+
+ if request:
+ print("\nSample request:")
+ print(f" URL: {request.get('url')}")
+ print(f" Method: {request.get('method')}")
+ print(f" Headers: {list(request.get('headers', {}).keys())}")
+
+ if response:
+ print("\nSample response:")
+ print(f" URL: {response.get('url')}")
+ print(f" Status: {response.get('status')} {response.get('status_text', '')}")
+ print(f" Headers: {list(response.get('headers', {}).keys())}")
+
+async def demo_basic_console_capture():
+ """Basic console message capturing example"""
+ print("\n=== 2. Basic Console Message Capturing ===")
+
+ # Create a simple HTML file with console messages
+ html_file = os.path.join(__cur_dir__, "tmp", "console_test.html")
+ with open(html_file, "w") as f:
+        f.write("""
+        <!DOCTYPE html>
+        <html>
+        <head>
+            <title>Console Test</title>
+        </head>
+        <body>
+            <h1>Console Message Test</h1>
+            <!-- Illustrative scripts: the demo only counts and prints messages by type,
+                 so any mix of console calls works here -->
+            <script>
+                console.log("Hello from console.log");
+                console.info("An informational message");
+                console.warn("A warning message");
+                console.error("An error message");
+            </script>
+        </body>
+        </html>
+        """)
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_console_messages=True,
+ wait_until="networkidle" # Wait to make sure all scripts execute
+ )
+
+ result = await crawler.arun(
+ url=f"file://{html_file}",
+ config=config
+ )
+
+ if result.success and result.console_messages:
+ print(f"Captured {len(result.console_messages)} console messages")
+
+ # Count by message type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print("Message types:")
+ for msg_type, count in message_types.items():
+ print(f" - {msg_type}: {count}")
+
+ # Show all messages
+ print("\nAll console messages:")
+ for i, msg in enumerate(result.console_messages, 1):
+ print(f" {i}. [{msg.get('type', 'unknown')}] {msg.get('text', '')}")
+
+async def demo_combined_capture():
+ """Capturing both network requests and console messages"""
+ print("\n=== 3. Combined Network and Console Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ wait_until="networkidle"
+ )
+
+ result = await crawler.arun(
+ url="https://httpbin.org/html",
+ config=config
+ )
+
+ if result.success:
+ network_count = len(result.network_requests) if result.network_requests else 0
+ console_count = len(result.console_messages) if result.console_messages else 0
+
+ print(f"Captured {network_count} network events and {console_count} console messages")
+
+ # Save the captured data to a JSON file for analysis
+ output_file = os.path.join(__cur_dir__, "tmp", "capture_data.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "network_requests": result.network_requests,
+ "console_messages": result.console_messages
+ }, f, indent=2)
+
+ print(f"Full capture data saved to {output_file}")
+
+async def analyze_spa_network_traffic():
+ """Analyze network traffic of a Single-Page Application"""
+ print("\n=== 4. Analyzing SPA Network Traffic ===")
+
+ async with AsyncWebCrawler(config=BrowserConfig(
+ headless=True,
+ viewport_width=1280,
+ viewport_height=800
+ )) as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ # Wait longer to ensure all resources are loaded
+ wait_until="networkidle",
+ page_timeout=60000, # 60 seconds
+ )
+
+ result = await crawler.arun(
+ url="https://weather.com",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ # Extract different types of requests
+ requests = []
+ responses = []
+ failures = []
+
+ for event in result.network_requests:
+ event_type = event.get("event_type")
+ if event_type == "request":
+ requests.append(event)
+ elif event_type == "response":
+ responses.append(event)
+ elif event_type == "request_failed":
+ failures.append(event)
+
+ print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")
+
+ # Analyze request types
+ resource_types = {}
+ for req in requests:
+ resource_type = req.get("resource_type", "unknown")
+ resource_types[resource_type] = resource_types.get(resource_type, 0) + 1
+
+ print("\nResource types:")
+ for resource_type, count in sorted(resource_types.items(), key=lambda x: x[1], reverse=True):
+ print(f" - {resource_type}: {count}")
+
+ # Analyze API calls
+ api_calls = [r for r in requests if "api" in r.get("url", "").lower()]
+ if api_calls:
+ print(f"\nDetected {len(api_calls)} API calls:")
+ for i, call in enumerate(api_calls[:5], 1): # Show first 5
+ print(f" {i}. {call.get('method')} {call.get('url')}")
+ if len(api_calls) > 5:
+ print(f" ... and {len(api_calls) - 5} more")
+
+ # Analyze response status codes
+ status_codes = {}
+ for resp in responses:
+ status = resp.get("status", 0)
+ status_codes[status] = status_codes.get(status, 0) + 1
+
+ print("\nResponse status codes:")
+ for status, count in sorted(status_codes.items()):
+ print(f" - {status}: {count}")
+
+ # Analyze failures
+ if failures:
+ print("\nFailed requests:")
+ for i, failure in enumerate(failures[:5], 1): # Show first 5
+ print(f" {i}. {failure.get('url')} - {failure.get('failure_text')}")
+ if len(failures) > 5:
+ print(f" ... and {len(failures) - 5} more")
+
+ # Check for console errors
+ if result.console_messages:
+ errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
+ if errors:
+ print(f"\nDetected {len(errors)} console errors:")
+ for i, error in enumerate(errors[:3], 1): # Show first 3
+ print(f" {i}. {error.get('text', '')[:100]}...")
+ if len(errors) > 3:
+ print(f" ... and {len(errors) - 3} more")
+
+ # Save analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "weather_network_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "statistics": {
+ "request_count": len(requests),
+ "response_count": len(responses),
+ "failure_count": len(failures),
+ "resource_types": resource_types,
+ "status_codes": {str(k): v for k, v in status_codes.items()},
+ "api_call_count": len(api_calls),
+ "console_error_count": len(errors) if result.console_messages else 0
+ },
+ "network_requests": result.network_requests,
+ "console_messages": result.console_messages
+ }, f, indent=2)
+
+ print(f"\nFull analysis saved to {output_file}")
+
+async def demo_security_analysis():
+ """Using network capture for security analysis"""
+ print("\n=== 5. Security Analysis with Network Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ wait_until="networkidle"
+ )
+
+ # A site that makes multiple third-party requests
+ result = await crawler.arun(
+ url="https://www.nytimes.com/",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ print(f"Captured {len(result.network_requests)} network events")
+
+            # Extract all unique domains from the captured requests
+            from urllib.parse import urlparse
+
+            domains = set()
+            for req in result.network_requests:
+                if req.get("event_type") == "request":
+                    url = req.get("url", "")
+                    try:
+                        domain = urlparse(url).netloc
+                        if domain:
+                            domains.add(domain)
+                    except Exception:
+                        pass
+
+ print(f"\nDetected requests to {len(domains)} unique domains:")
+ main_domain = urlparse(result.url).netloc
+
+ # Separate first-party vs third-party domains
+ first_party = [d for d in domains if main_domain in d]
+ third_party = [d for d in domains if main_domain not in d]
+
+ print(f" - First-party domains: {len(first_party)}")
+ print(f" - Third-party domains: {len(third_party)}")
+
+ # Look for potential trackers/analytics
+ tracking_keywords = ["analytics", "tracker", "pixel", "tag", "stats", "metric", "collect", "beacon"]
+ potential_trackers = []
+
+ for domain in third_party:
+ if any(keyword in domain.lower() for keyword in tracking_keywords):
+ potential_trackers.append(domain)
+
+ if potential_trackers:
+ print(f"\nPotential tracking/analytics domains ({len(potential_trackers)}):")
+ for i, domain in enumerate(sorted(potential_trackers)[:10], 1):
+ print(f" {i}. {domain}")
+ if len(potential_trackers) > 10:
+ print(f" ... and {len(potential_trackers) - 10} more")
+
+ # Check for insecure (HTTP) requests
+ insecure_requests = [
+ req.get("url") for req in result.network_requests
+ if req.get("event_type") == "request" and req.get("url", "").startswith("http://")
+ ]
+
+ if insecure_requests:
+ print(f"\nWarning: Found {len(insecure_requests)} insecure (HTTP) requests:")
+ for i, url in enumerate(insecure_requests[:5], 1):
+ print(f" {i}. {url}")
+ if len(insecure_requests) > 5:
+ print(f" ... and {len(insecure_requests) - 5} more")
+
+ # Save security analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "security_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "main_domain": main_domain,
+ "timestamp": datetime.now().isoformat(),
+ "analysis": {
+ "total_requests": len([r for r in result.network_requests if r.get("event_type") == "request"]),
+ "unique_domains": len(domains),
+ "first_party_domains": first_party,
+ "third_party_domains": third_party,
+ "potential_trackers": potential_trackers,
+ "insecure_requests": insecure_requests
+ }
+ }, f, indent=2)
+
+ print(f"\nFull security analysis saved to {output_file}")
+
+async def demo_performance_analysis():
+ """Using network capture for performance analysis"""
+ print("\n=== 6. Performance Analysis with Network Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ page_timeout=60 * 2 * 1000 # 120 seconds
+ )
+
+ result = await crawler.arun(
+ url="https://www.cnn.com/",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ # Filter only response events with timing information
+ responses_with_timing = [
+ r for r in result.network_requests
+ if r.get("event_type") == "response" and r.get("request_timing")
+ ]
+
+ if responses_with_timing:
+ print(f"Analyzing timing for {len(responses_with_timing)} network responses")
+
+ # Group by resource type
+ resource_timings = {}
+ for resp in responses_with_timing:
+ url = resp.get("url", "")
+ timing = resp.get("request_timing", {})
+
+                # Determine resource type from the file extension of the URL path
+                filename = url.split("/")[-1].split("?")[0].split("#")[0]
+                ext = filename.split(".")[-1].lower() if "." in filename else "unknown"
+ if ext in ["jpg", "jpeg", "png", "gif", "webp", "svg", "ico"]:
+ resource_type = "image"
+ elif ext in ["js"]:
+ resource_type = "javascript"
+ elif ext in ["css"]:
+ resource_type = "css"
+ elif ext in ["woff", "woff2", "ttf", "otf", "eot"]:
+ resource_type = "font"
+ else:
+ resource_type = "other"
+
+ if resource_type not in resource_timings:
+ resource_timings[resource_type] = []
+
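+                # request_timing may expose different timing field sets; both variants are handled below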
+ # Calculate request duration if timing information is available
+ if isinstance(timing, dict) and "requestTime" in timing and "receiveHeadersEnd" in timing:
+ # Convert to milliseconds
+ duration = (timing["receiveHeadersEnd"] - timing["requestTime"]) * 1000
+ resource_timings[resource_type].append({
+ "url": url,
+ "duration_ms": duration
+ })
+                elif isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:  # avoid counting the same response twice
+ # Convert to milliseconds
+ duration = (timing["responseStart"] - timing["requestStart"]) * 1000
+ resource_timings[resource_type].append({
+ "url": url,
+ "duration_ms": duration
+ })
+
+ # Calculate statistics for each resource type
+ print("\nPerformance by resource type:")
+ for resource_type, timings in resource_timings.items():
+ if timings:
+ durations = [t["duration_ms"] for t in timings]
+ avg_duration = sum(durations) / len(durations)
+ max_duration = max(durations)
+ slowest_resource = next(t["url"] for t in timings if t["duration_ms"] == max_duration)
+
+ print(f" {resource_type.upper()}:")
+ print(f" - Count: {len(timings)}")
+ print(f" - Avg time: {avg_duration:.2f} ms")
+ print(f" - Max time: {max_duration:.2f} ms")
+ print(f" - Slowest: {slowest_resource}")
+
+ # Identify the slowest resources overall
+ all_timings = []
+ for resource_type, timings in resource_timings.items():
+ for timing in timings:
+ timing["type"] = resource_type
+ all_timings.append(timing)
+
+ all_timings.sort(key=lambda x: x["duration_ms"], reverse=True)
+
+ print("\nTop 5 slowest resources:")
+ for i, timing in enumerate(all_timings[:5], 1):
+ print(f" {i}. [{timing['type']}] {timing['url']} - {timing['duration_ms']:.2f} ms")
+
+ # Save performance analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "performance_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "resource_timings": resource_timings,
+ "slowest_resources": all_timings[:10] # Save top 10
+ }, f, indent=2)
+
+ print(f"\nFull performance analysis saved to {output_file}")
+
+async def main():
+ """Run all demo functions sequentially"""
+ print("=== Network and Console Capture Examples ===")
+
+ # Make sure tmp directory exists
+ os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
+
+ # Run basic examples
+ # await demo_basic_network_capture()
+ await demo_basic_console_capture()
+ # await demo_combined_capture()
+
+ # Run advanced examples
+ # await analyze_spa_network_traffic()
+ # await demo_security_analysis()
+ # await demo_performance_analysis()
+
+ print("\n=== Examples Complete ===")
+ print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/proxy_rotation_demo.py b/docs/examples/proxy_rotation_demo.py
new file mode 100644
index 00000000..7efa974d
--- /dev/null
+++ b/docs/examples/proxy_rotation_demo.py
@@ -0,0 +1,161 @@
+import os
+import re
+from typing import List, Dict
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    CrawlerMonitor,
+    DefaultMarkdownGenerator,
+    DisplayMode,
+    MemoryAdaptiveDispatcher,
+    RoundRobinProxyStrategy,
+)
+
+def load_proxies_from_env() -> List[Dict]:
+ """Load proxies from PROXIES environment variable"""
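+    # Expected format: "ip1:port1:user1:pass1,ip2:port2:user2:pass2"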
+ proxies = []
+ try:
+ proxy_list = os.getenv("PROXIES", "").split(",")
+ for proxy in proxy_list:
+ if not proxy:
+ continue
+ ip, port, username, password = proxy.split(":")
+ proxies.append({
+ "server": f"http://{ip}:{port}",
+ "username": username,
+ "password": password,
+ "ip": ip # Store original IP for verification
+ })
+ except Exception as e:
+ print(f"Error loading proxies from environment: {e}")
+ return proxies
+
+async def demo_proxy_rotation():
+ """
+ Proxy Rotation Demo using RoundRobinProxyStrategy
+ ===============================================
+ Demonstrates proxy rotation using the strategy pattern.
+ """
+ print("\n=== Proxy Rotation Demo (Round Robin) ===")
+
+ # Load proxies and create rotation strategy
+ proxies = load_proxies_from_env()
+ if not proxies:
+ print("No proxies found in environment. Set PROXIES env variable!")
+ return
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
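+    # The round-robin strategy hands out the next proxy in the list for each new request, wrapping around at the end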
+
+ # Create configs
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ proxy_rotation_strategy=proxy_strategy
+ )
+
+ # Test URLs
+ urls = ["https://httpbin.org/ip"] * len(proxies) # Test each proxy once
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ for url in urls:
+ result = await crawler.arun(url=url, config=run_config)
+
+ if result.success:
+ # Extract IP from response
+ ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+ current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+ if current_proxy:
+ print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
+ verified = ip_match and ip_match.group(0) == current_proxy['ip']
+ if verified:
+ print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
+ else:
+ print("❌ Proxy failed or IP mismatch!")
+ else:
+ print(f"Request failed: {result.error_message}")
+
+async def demo_proxy_rotation_batch():
+ """
+ Proxy Rotation Demo with Batch Processing
+ =======================================
+ Demonstrates proxy rotation using arun_many with memory dispatcher.
+ """
+ print("\n=== Proxy Rotation Batch Demo ===")
+
+ try:
+ # Load proxies and create rotation strategy
+ proxies = load_proxies_from_env()
+ if not proxies:
+ print("No proxies found in environment. Set PROXIES env variable!")
+ return
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+ # Configurations
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ proxy_rotation_strategy=proxy_strategy,
+ markdown_generator=DefaultMarkdownGenerator()
+ )
+
+ # Test URLs - multiple requests to test rotation
+ urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
+
+ print("\n📈 Initializing crawler with proxy rotation...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ monitor = CrawlerMonitor(
+ max_visible_rows=10,
+ display_mode=DisplayMode.DETAILED
+ )
+
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=80.0,
+ check_interval=0.5,
+                max_session_permit=1,  # or len(proxies) to match concurrent sessions to the proxy count
+ # monitor=monitor
+ )
+
+ print("\n🚀 Starting batch crawl with proxy rotation...")
+ results = await crawler.arun_many(
+ urls=urls,
+ config=run_config,
+ dispatcher=dispatcher
+ )
+
+ # Verify results
+ success_count = 0
+ for result in results:
+ if result.success:
+ ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+ current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+ if current_proxy and ip_match:
+ print(f"URL {result.url}")
+ print(f"Proxy {current_proxy['server']} -> Response IP: {ip_match.group(0)}")
+ verified = ip_match.group(0) == current_proxy['ip']
+ if verified:
+ print(f"✅ Proxy working! IP matches: {current_proxy['ip']}")
+ success_count += 1
+ else:
+ print("❌ Proxy failed or IP mismatch!")
+ print("---")
+
+ print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")
+
+ except Exception as e:
+ print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
+
+if __name__ == "__main__":
+ import asyncio
+
+ async def run_demos():
+ # await demo_proxy_rotation() # Original single-request demo
+ await demo_proxy_rotation_batch() # New batch processing demo
+
+ asyncio.run(run_demos())
diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb
index 4751dec8..56365cde 100644
--- a/docs/examples/quickstart.ipynb
+++ b/docs/examples/quickstart.ipynb
@@ -80,7 +80,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "003376f3",
"metadata": {},
"outputs": [
@@ -114,7 +114,7 @@
" url=\"https://www.nbcnews.com/business\",\n",
" bypass_cache=True # By default this is False, meaning the cache will be used\n",
" )\n",
- " print(result.markdown[:500]) # Print the first 500 characters\n",
+ " print(result.markdown.raw_markdown[:500]) # Print the first 500 characters\n",
" \n",
"asyncio.run(simple_crawl())"
]
@@ -129,7 +129,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"id": "5bb8c1e4",
"metadata": {},
"outputs": [
@@ -177,7 +177,7 @@
" # wait_for=wait_for,\n",
" bypass_cache=True,\n",
" )\n",
- " print(result.markdown[:500]) # Print first 500 characters\n",
+ " print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
"\n",
"asyncio.run(crawl_dynamic_content())"
]
@@ -206,11 +206,11 @@
" word_count_threshold=10,\n",
" bypass_cache=True\n",
" )\n",
- " full_markdown_length = len(result.markdown)\n",
- " fit_markdown_length = len(result.fit_markdown)\n",
+ " full_markdown_length = len(result.markdown.raw_markdown)\n",
+ " fit_markdown_length = len(result.markdown.fit_markdown)\n",
" print(f\"Full Markdown Length: {full_markdown_length}\")\n",
" print(f\"Fit Markdown Length: {fit_markdown_length}\")\n",
- " print(result.fit_markdown[:1000])\n",
+ " print(result.markdown.fit_markdown[:1000])\n",
" \n",
"\n",
"asyncio.run(clean_content())"
@@ -342,7 +342,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"id": "bc4d2fc8",
"metadata": {},
"outputs": [
@@ -387,7 +387,7 @@
" url=\"https://crawl4ai.com\",\n",
" bypass_cache=True\n",
" )\n",
- " print(result.markdown[:500]) # Display the first 500 characters\n",
+ " print(result.markdown.raw_markdown[:500]) # Display the first 500 characters\n",
"\n",
"asyncio.run(custom_hook_workflow())"
]
@@ -465,7 +465,7 @@
" bypass_cache=True\n",
" )\n",
" print(f\"Page {page_number} Content:\")\n",
- " print(result.markdown[:500]) # Print first 500 characters\n",
+ " print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
"\n",
"# asyncio.run(multi_page_session_crawl())"
]
diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart.py
similarity index 88%
rename from docs/examples/quickstart_async.config.py
rename to docs/examples/quickstart.py
index b58443bd..5efb785d 100644
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart.py
@@ -1,5 +1,7 @@
import os, sys
+from crawl4ai import LLMConfig
+
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
@@ -57,8 +59,8 @@ async def clean_content():
url="https://en.wikipedia.org/wiki/Apple",
config=crawler_config,
)
- full_markdown_length = len(result.markdown_v2.raw_markdown)
- fit_markdown_length = len(result.markdown_v2.fit_markdown)
+ full_markdown_length = len(result.markdown.raw_markdown)
+ fit_markdown_length = len(result.markdown.fit_markdown)
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
@@ -137,7 +139,7 @@ async def custom_hook_workflow(verbose=True):
# Perform the crawl operation
result = await crawler.arun(url="https://crawl4ai.com")
- print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
+ print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
@@ -209,8 +211,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
- provider=provider,
- api_token=api_token,
+            llm_config=LLMConfig(provider=provider, api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -415,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction():
+ from crawl4ai.extraction_strategy import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
@@ -506,6 +508,9 @@ async def ssl_certification():
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
+ tmp_dir = os.path.join(__location__, "tmp")
+ os.makedirs(tmp_dir, exist_ok=True)
+
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
@@ -528,67 +533,6 @@ async def ssl_certification():
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
-# Speed Comparison
-async def speed_comparison():
- print("\n--- Speed Comparison ---")
-
- # Firecrawl comparison
- from firecrawl import FirecrawlApp
-
- app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
- start = time.time()
- scrape_status = app.scrape_url(
- "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
- )
- end = time.time()
- print("Firecrawl:")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(scrape_status['markdown'])} characters")
- print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
- print()
-
- # Crawl4AI comparisons
- browser_config = BrowserConfig(headless=True)
-
- # Simple crawl
- async with AsyncWebCrawler(config=browser_config) as crawler:
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- config=CrawlerRunConfig(
- cache_mode=CacheMode.BYPASS, word_count_threshold=0
- ),
- )
- end = time.time()
- print("Crawl4AI (simple crawl):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
- # Advanced filtering
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- config=CrawlerRunConfig(
- cache_mode=CacheMode.BYPASS,
- word_count_threshold=0,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
- ),
- ),
- )
- end = time.time()
- print("Crawl4AI (Markdown Plus):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
- print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
-
# Main execution
async def main():
# Basic examples
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
deleted file mode 100644
index 1585ebea..00000000
--- a/docs/examples/quickstart_async.py
+++ /dev/null
@@ -1,675 +0,0 @@
-import os, sys
-
-# append parent directory to system path
-sys.path.append(
- os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-)
-os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"
-
-import asyncio
-# import nest_asyncio
-# nest_asyncio.apply()
-
-import time
-import json
-import os
-import re
-from typing import Dict, List
-from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import PruningContentFilter
-from crawl4ai.extraction_strategy import (
- JsonCssExtractionStrategy,
- LLMExtractionStrategy,
-)
-
-__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-print("Crawl4AI: Advanced Web Crawling and Data Extraction")
-print("GitHub Repository: https://github.com/unclecode/crawl4ai")
-print("Twitter: @unclecode")
-print("Website: https://crawl4ai.com")
-
-
-async def simple_crawl():
- print("\n--- Basic Usage ---")
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def simple_example_with_running_js_code():
- print("\n--- Executing JavaScript and Using CSS Selectors ---")
- # New code to handle the wait_for parameter
- wait_for = """() => {
- return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
- }"""
-
- # wait_for can be also just a css selector
- # wait_for = "article.tease-card:nth-child(10)"
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- js_code = [
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ]
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- js_code=js_code,
- # wait_for=wait_for,
- cache_mode=CacheMode.BYPASS,
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def simple_example_with_css_selector():
- print("\n--- Using CSS Selectors ---")
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- css_selector=".wide-tease-item__description",
- cache_mode=CacheMode.BYPASS,
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def use_proxy():
- print("\n--- Using a Proxy ---")
- print(
- "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
- )
- # Uncomment and modify the following lines to use a proxy
- async with AsyncWebCrawler(
- verbose=True, proxy="http://your-proxy-url:port"
- ) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
- )
- if result.success:
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def capture_and_save_screenshot(url: str, output_path: str):
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url=url, screenshot=True, cache_mode=CacheMode.BYPASS
- )
-
- if result.success and result.screenshot:
- import base64
-
- # Decode the base64 screenshot data
- screenshot_data = base64.b64decode(result.screenshot)
-
- # Save the screenshot as a JPEG file
- with open(output_path, "wb") as f:
- f.write(screenshot_data)
-
- print(f"Screenshot saved successfully to {output_path}")
- else:
- print("Failed to capture screenshot")
-
-
-class OpenAIModelFee(BaseModel):
- model_name: str = Field(..., description="Name of the OpenAI model.")
- input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
- output_fee: str = Field(
- ..., description="Fee for output token for the OpenAI model."
- )
-
-
-async def extract_structured_data_using_llm(
- provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
-):
- print(f"\n--- Extracting Structured Data with {provider} ---")
-
- if api_token is None and provider != "ollama":
- print(f"API token is required for {provider}. Skipping this example.")
- return
-
- # extra_args = {}
- extra_args = {
- "temperature": 0,
- "top_p": 0.9,
- "max_tokens": 2000,
- # any other supported parameters for litellm
- }
- if extra_headers:
- extra_args["extra_headers"] = extra_headers
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://openai.com/api/pricing/",
- word_count_threshold=1,
- extraction_strategy=LLMExtractionStrategy(
- provider=provider,
- api_token=api_token,
- schema=OpenAIModelFee.model_json_schema(),
- extraction_type="schema",
- instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
- Do not miss any models in the entire content. One extracted model JSON format should look like this:
- {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
- extra_args=extra_args,
- ),
- cache_mode=CacheMode.BYPASS,
- )
- print(result.extracted_content)
-
-
-async def extract_structured_data_using_css_extractor():
- print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
- schema = {
- "name": "KidoCode Courses",
- "baseSelector": "section.charge-methodology .w-tab-content > div",
- "fields": [
- {
- "name": "section_title",
- "selector": "h3.heading-50",
- "type": "text",
- },
- {
- "name": "section_description",
- "selector": ".charge-content",
- "type": "text",
- },
- {
- "name": "course_name",
- "selector": ".text-block-93",
- "type": "text",
- },
- {
- "name": "course_description",
- "selector": ".course-content-text",
- "type": "text",
- },
- {
- "name": "course_icon",
- "selector": ".image-92",
- "type": "attribute",
- "attribute": "src",
- },
- ],
- }
-
- async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
- # Create the JavaScript that handles clicking multiple times
- js_click_tabs = """
- (async () => {
- const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
-
- for(let tab of tabs) {
- // scroll to the tab
- tab.scrollIntoView();
- tab.click();
- // Wait for content to load and animations to complete
- await new Promise(r => setTimeout(r, 500));
- }
- })();
- """
-
- result = await crawler.arun(
- url="https://www.kidocode.com/degrees/technology",
- extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
- js_code=[js_click_tabs],
- cache_mode=CacheMode.BYPASS,
- )
-
- companies = json.loads(result.extracted_content)
- print(f"Successfully extracted {len(companies)} companies")
- print(json.dumps(companies[0], indent=2))
-
-
-# Advanced Session-Based Crawling with Dynamic Content 🔄
-async def crawl_dynamic_content_pages_method_1():
- print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
- first_commit = ""
-
- async def on_execution_started(page):
- nonlocal first_commit
- try:
- while True:
- await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
- commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
- commit = await commit.evaluate("(element) => element.textContent")
- commit = re.sub(r"\s+", "", commit)
- if commit and commit != first_commit:
- first_commit = commit
- break
- await asyncio.sleep(0.5)
- except Exception as e:
- print(f"Warning: New content didn't appear after JavaScript execution: {e}")
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
-
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
-
- js_next_page = """
- (() => {
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
- })();
- """
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- js=js_next_page if page > 0 else None,
- cache_mode=CacheMode.BYPASS,
- js_only=page > 0,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- soup = BeautifulSoup(result.cleaned_html, "html.parser")
- commits = soup.select("li")
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_2():
- print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
- last_commit = ""
-
- js_next_page_and_wait = """
- (async () => {
- const getCurrentCommit = () => {
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- return commits.length > 0 ? commits[0].textContent.trim() : null;
- };
-
- const initialCommit = getCurrentCommit();
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
-
- // Poll for changes
- while (true) {
- await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
- const newCommit = getCurrentCommit();
- if (newCommit && newCommit !== initialCommit) {
- break;
- }
- }
- })();
- """
-
- schema = {
- "name": "Commit Extractor",
- "baseSelector": "li.Box-sc-g0xbh4-0",
- "fields": [
- {
- "name": "title",
- "selector": "h4.markdown-title",
- "type": "text",
- "transform": "strip",
- },
- ],
- }
- extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- extraction_strategy=extraction_strategy,
- js_code=js_next_page_and_wait if page > 0 else None,
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- commits = json.loads(result.extracted_content)
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_3():
- print(
- "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
- )
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
-
- js_next_page = """
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- if (commits.length > 0) {
- window.firstCommit = commits[0].textContent.trim();
- }
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
- """
-
- wait_for = """() => {
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- if (commits.length === 0) return false;
- const firstCommit = commits[0].textContent.trim();
- return firstCommit !== window.firstCommit;
- }"""
-
- schema = {
- "name": "Commit Extractor",
- "baseSelector": "li.Box-sc-g0xbh4-0",
- "fields": [
- {
- "name": "title",
- "selector": "h4.markdown-title",
- "type": "text",
- "transform": "strip",
- },
- ],
- }
- extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- extraction_strategy=extraction_strategy,
- js_code=js_next_page if page > 0 else None,
- wait_for=wait_for if page > 0 else None,
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- commits = json.loads(result.extracted_content)
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_custom_browser_type():
- # Use Firefox
- start = time.time()
- async with AsyncWebCrawler(
- browser_type="firefox", verbose=True, headless=True
- ) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
- # Use WebKit
- start = time.time()
- async with AsyncWebCrawler(
- browser_type="webkit", verbose=True, headless=True
- ) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
- # Use Chromium (default)
- start = time.time()
- async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
-
-async def crawl_with_user_simultion():
- async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
- url = "YOUR-URL-HERE"
- result = await crawler.arun(
- url=url,
- cache_mode=CacheMode.BYPASS,
- magic=True, # Automatically detects and removes overlays, popups, and other elements that block content
- # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
- # override_navigator = True # Overrides the navigator object to make it look like a real user
- )
-
- print(result.markdown)
-
-
-async def speed_comparison():
- # print("\n--- Speed Comparison ---")
- # print("Firecrawl (simulated):")
- # print("Time taken: 7.02 seconds")
- # print("Content length: 42074 characters")
- # print("Images found: 49")
- # print()
- # Simulated Firecrawl performance
- from firecrawl import FirecrawlApp
-
- app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
- start = time.time()
- scrape_status = app.scrape_url(
- "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
- )
- end = time.time()
- print("Firecrawl:")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(scrape_status['markdown'])} characters")
- print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
- print()
-
- async with AsyncWebCrawler() as crawler:
- # Crawl4AI simple crawl
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- word_count_threshold=0,
- cache_mode=CacheMode.BYPASS,
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (simple crawl):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
- # Crawl4AI with advanced content filtering
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- word_count_threshold=0,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
- # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
- ),
- cache_mode=CacheMode.BYPASS,
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (Markdown Plus):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
- print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
- # Crawl4AI with JavaScript execution
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- js_code=[
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ],
- word_count_threshold=0,
- cache_mode=CacheMode.BYPASS,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
- # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
- ),
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (with JavaScript execution):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown)} characters")
- print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
-
- print("\nNote on Speed Comparison:")
- print("The speed test conducted here may not reflect optimal conditions.")
- print("When we call Firecrawl's API, we're seeing its best performance,")
- print("while Crawl4AI's performance is limited by the local network speed.")
- print("For a more accurate comparison, it's recommended to run these tests")
- print("on servers with a stable and fast internet connection.")
- print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
- print("If you run these tests in an environment with better network conditions,")
- print("you may observe an even more significant speed advantage for Crawl4AI.")
-
-
-async def generate_knowledge_graph():
- class Entity(BaseModel):
- name: str
- description: str
-
- class Relationship(BaseModel):
- entity1: Entity
- entity2: Entity
- description: str
- relation_type: str
-
- class KnowledgeGraph(BaseModel):
- entities: List[Entity]
- relationships: List[Relationship]
-
- extraction_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4o-mini", # Or any other provider, including Ollama and open source models
- api_token=os.getenv("OPENAI_API_KEY"), # In case of Ollama just pass "no-token"
- schema=KnowledgeGraph.model_json_schema(),
- extraction_type="schema",
- instruction="""Extract entities and relationships from the given text.""",
- )
- async with AsyncWebCrawler() as crawler:
- url = "https://paulgraham.com/love.html"
- result = await crawler.arun(
- url=url,
- cache_mode=CacheMode.BYPASS,
- extraction_strategy=extraction_strategy,
- # magic=True
- )
- # print(result.extracted_content)
- with open(os.path.join(__location__, "kb.json"), "w") as f:
- f.write(result.extracted_content)
-
-
-async def fit_markdown_remove_overlay():
- async with AsyncWebCrawler(
- headless=True, # Set to False to see what is happening
- verbose=True,
- user_agent_mode="random",
- user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
- ) as crawler:
- result = await crawler.arun(
- url="https://www.kidocode.com/degrees/technology",
- cache_mode=CacheMode.BYPASS,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- ),
- options={"ignore_links": True},
- ),
- # markdown_generator=DefaultMarkdownGenerator(
- # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
- # options={
- # "ignore_links": True
- # }
- # ),
- )
-
- if result.success:
- print(len(result.markdown_v2.raw_markdown))
- print(len(result.markdown_v2.markdown_with_citations))
- print(len(result.markdown_v2.fit_markdown))
-
- # Save clean html
- with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
- f.write(result.cleaned_html)
-
- with open(
- os.path.join(__location__, "output/output_raw_markdown.md"), "w"
- ) as f:
- f.write(result.markdown_v2.raw_markdown)
-
- with open(
- os.path.join(__location__, "output/output_markdown_with_citations.md"),
- "w",
- ) as f:
- f.write(result.markdown_v2.markdown_with_citations)
-
- with open(
- os.path.join(__location__, "output/output_fit_markdown.md"), "w"
- ) as f:
- f.write(result.markdown_v2.fit_markdown)
-
- print("Done")
-
-
-async def main():
- # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-
- # await simple_crawl()
- # await simple_example_with_running_js_code()
- # await simple_example_with_css_selector()
- # # await use_proxy()
- # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
- # await extract_structured_data_using_css_extractor()
-
- # LLM extraction examples
- # await extract_structured_data_using_llm()
- # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
- # await extract_structured_data_using_llm("ollama/llama3.2")
-
- # You always can pass custom headers to the extraction strategy
- # custom_headers = {
- # "Authorization": "Bearer your-custom-token",
- # "X-Custom-Header": "Some-Value"
- # }
- # await extract_structured_data_using_llm(extra_headers=custom_headers)
-
- # await crawl_dynamic_content_pages_method_1()
- # await crawl_dynamic_content_pages_method_2()
- await crawl_dynamic_content_pages_method_3()
-
- # await crawl_custom_browser_type()
-
- # await speed_comparison()
-
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/docs/examples/quickstart_examples_set_1.py b/docs/examples/quickstart_examples_set_1.py
new file mode 100644
index 00000000..078d1c4a
--- /dev/null
+++ b/docs/examples/quickstart_examples_set_1.py
@@ -0,0 +1,412 @@
+import asyncio
+import os
+import json
+import base64
+from pathlib import Path
+from typing import List
+from crawl4ai import ProxyConfig
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
+from crawl4ai import RoundRobinProxyStrategy
+from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
+from crawl4ai import LLMConfig
+from crawl4ai import PruningContentFilter, BM25ContentFilter
+from crawl4ai import DefaultMarkdownGenerator
+from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
+from crawl4ai import BrowserConfig
+
+__cur_dir__ = Path(__file__).parent
+
+# Make sure the tmp directory exists for the schema, image, and link files saved below
+os.makedirs(__cur_dir__ / "tmp", exist_ok=True)
+
+async def demo_basic_crawl():
+ """Basic web crawling with markdown generation"""
+ print("\n=== 1. Basic Web Crawling ===")
+ async with AsyncWebCrawler(config = BrowserConfig(
+ viewport_height=800,
+ viewport_width=1200,
+ headless=True,
+ verbose=True,
+ )) as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com/"
+ )
+
+ for i, result in enumerate(results):
+ print(f"Result {i + 1}:")
+ print(f"Success: {result.success}")
+ if result.success:
+ print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
+ print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
+ else:
+ print("Failed to crawl the URL")
+
+async def demo_parallel_crawl():
+ """Crawl multiple URLs in parallel"""
+ print("\n=== 2. Parallel Crawling ===")
+
+ urls = [
+ "https://news.ycombinator.com/",
+ "https://example.com/",
+ "https://httpbin.org/html",
+ ]
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun_many(
+ urls=urls,
+ )
+
+ print(f"Crawled {len(results)} URLs in parallel:")
+ for i, result in enumerate(results):
+ print(
+ f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
+ )
+
+async def demo_fit_markdown():
+    """Generate focused markdown with a pruning content filter"""
+    print("\n=== 3. Fit Markdown with Pruning Content Filter ===")
+
+ async with AsyncWebCrawler() as crawler:
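+        # The pruning filter drops low-value blocks, so fit_markdown ends up shorter than raw_markdown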
+ result: CrawlResult = await crawler.arun(
+ url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
+ config=CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter()
+ )
+ ),
+ )
+
+ # Print stats and save the fit markdown
+ print(f"Raw: {len(result.markdown.raw_markdown)} chars")
+ print(f"Fit: {len(result.markdown.fit_markdown)} chars")
+
+async def demo_llm_structured_extraction_no_schema():
+    """Extract structured data with an LLM, describing the desired fields inline"""
+    print("\n=== 4. LLM-Based Structured Extraction (No Schema Class) ===")
+
+    # Create a simple LLM extraction strategy (no Pydantic model or formal JSON Schema required)
+ extraction_strategy = LLMExtractionStrategy(
+ llm_config=LLMConfig(
+ provider="groq/qwen-2.5-32b",
+ api_token="env:GROQ_API_KEY",
+ ),
+ instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
+        extraction_type="schema",
+ schema="{title: string, url: string, comments: int}",
+ extra_args={
+ "temperature": 0.0,
+ "max_tokens": 4096,
+ },
+ verbose=True,
+ )
+
+ config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ "https://news.ycombinator.com/", config=config
+ )
+
+ for result in results:
+ print(f"URL: {result.url}")
+ print(f"Success: {result.success}")
+ if result.success:
+ data = json.loads(result.extracted_content)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+async def demo_css_structured_extraction_no_schema():
+ """Extract structured data using CSS selectors"""
+ print("\n=== 5. CSS-Based Structured Extraction ===")
+ # Sample HTML for schema generation (one-time cost)
+    sample_html = """
+    <!-- Illustrative stand-in: paste a real news <div> (with title, date, description) from the target page -->
+    <div class="news-item"><h2 class="news-title">Example headline</h2>
+        <span class="news-date">Apr 01, 2025</span>
+        <p class="news-description">Short summary of the story.</p></div>
+    """
+
+ # Check if schema file exists
+ schema_file_path = f"{__cur_dir__}/tmp/schema.json"
+ if os.path.exists(schema_file_path):
+ with open(schema_file_path, "r") as f:
+ schema = json.load(f)
+ else:
+ # Generate schema using LLM (one-time setup)
+ schema = JsonCssExtractionStrategy.generate_schema(
+ html=sample_html,
+ llm_config=LLMConfig(
+ provider="groq/qwen-2.5-32b",
+ api_token="env:GROQ_API_KEY",
+ ),
+ query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
+ )
+
+ print(f"Generated schema: {json.dumps(schema, indent=2)}")
+    # Save the schema to a file and reuse it for future extractions; the LLM is only needed once to generate it
+ with open(f"{__cur_dir__}/tmp/schema.json", "w") as f:
+ json.dump(schema, f, indent=2)
+
+ # Create no-LLM extraction strategy with the generated schema
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+ config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+ # Use the fast CSS extraction (no LLM calls during extraction)
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ "https://thehackernews.com", config=config
+ )
+
+ for result in results:
+ print(f"URL: {result.url}")
+ print(f"Success: {result.success}")
+ if result.success:
+ data = json.loads(result.extracted_content)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+async def demo_deep_crawl():
+ """Deep crawling with BFS strategy"""
+ print("\n=== 6. Deep Crawling ===")
+
+ filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
+
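+    # max_depth=1 follows links one hop from the start URL; max_pages=5 caps the total pages crawled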
+ deep_crawl_strategy = BFSDeepCrawlStrategy(
+ max_depth=1, max_pages=5, filter_chain=filter_chain
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://docs.crawl4ai.com",
+ config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
+ )
+
+ print(f"Deep crawl returned {len(results)} pages:")
+ for i, result in enumerate(results):
+ depth = result.metadata.get("depth", "unknown")
+ print(f" {i + 1}. {result.url} (Depth: {depth})")
+
+async def demo_js_interaction():
+ """Execute JavaScript to load more content"""
+ print("\n=== 7. JavaScript Interaction ===")
+
+    # Hacker News: clicking the "More" link via JS loads additional items in the same session
+ async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
+ # Initial load
+
+ news_schema = {
+ "name": "news",
+ "baseSelector": "tr.athing",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "span.titleline",
+ "type": "text",
+ }
+ ],
+ }
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com",
+ config=CrawlerRunConfig(
+ session_id="hn_session", # Keep session
+ extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
+ ),
+ )
+
+ news = []
+ for result in results:
+ if result.success:
+ data = json.loads(result.extracted_content)
+ news.extend(data)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+ print(f"Initial items: {len(news)}")
+
+ # Click "More" link
+ more_config = CrawlerRunConfig(
+ js_code="document.querySelector('a.morelink').click();",
+ js_only=True, # Continue in same page
+ session_id="hn_session", # Keep session
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema=news_schema,
+ ),
+ )
+
+        results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com", config=more_config
+ )
+
+ # Extract new items
+ for result in results:
+ if result.success:
+ data = json.loads(result.extracted_content)
+ news.extend(data)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+ print(f"Total items: {len(news)}")
+
+async def demo_media_and_links():
+ """Extract media and links from a page"""
+ print("\n=== 8. Media and Links Extraction ===")
+
+ async with AsyncWebCrawler() as crawler:
+ result: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")
+
+ for i, result in enumerate(result):
+ # Extract and save all images
+ images = result.media.get("images", [])
+ print(f"Found {len(images)} images")
+
+ # Extract and save all links (internal and external)
+ internal_links = result.links.get("internal", [])
+ external_links = result.links.get("external", [])
+ print(f"Found {len(internal_links)} internal links")
+ print(f"Found {len(external_links)} external links")
+
+ # Print some of the images and links
+ for image in images[:3]:
+ print(f"Image: {image['src']}")
+ for link in internal_links[:3]:
+ print(f"Internal link: {link['href']}")
+ for link in external_links[:3]:
+ print(f"External link: {link['href']}")
+
+        # Save everything to files
+ with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
+ json.dump(images, f, indent=2)
+
+ with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
+ json.dump(
+ {"internal": internal_links, "external": external_links},
+ f,
+ indent=2,
+ )
+
+async def demo_screenshot_and_pdf():
+ """Capture screenshot and PDF of a page"""
+ print("\n=== 9. Screenshot and PDF Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+ # url="https://example.com",
+ url="https://en.wikipedia.org/wiki/Giant_anteater",
+ config=CrawlerRunConfig(screenshot=True, pdf=True),
+ )
+
+        for result in results:
+ if result.screenshot:
+ # Save screenshot
+ screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
+ with open(screenshot_path, "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ print(f"Screenshot saved to {screenshot_path}")
+
+ if result.pdf:
+ # Save PDF
+ pdf_path = f"{__cur_dir__}/tmp/example.pdf"
+ with open(pdf_path, "wb") as f:
+ f.write(result.pdf)
+ print(f"PDF saved to {pdf_path}")
+
+async def demo_proxy_rotation():
+ """Proxy rotation for multiple requests"""
+ print("\n=== 10. Proxy Rotation ===")
+
+ # Example proxies (replace with real ones)
+ proxies = [
+ ProxyConfig(server="http://proxy1.example.com:8080"),
+ ProxyConfig(server="http://proxy2.example.com:8080"),
+ ]
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+ print(f"Using {len(proxies)} proxies in rotation")
+ print(
+ "Note: This example uses placeholder proxies - replace with real ones to test"
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ proxy_rotation_strategy=proxy_strategy
+ )
+
+ # In a real scenario, these would be run and the proxies would rotate
+ print("In a real scenario, requests would rotate through the available proxies")
+
+async def demo_raw_html_and_file():
+ """Process raw HTML and local files"""
+ print("\n=== 11. Raw HTML and Local Files ===")
+
+ raw_html = """
+
+ Sample Article
+ This is sample content for testing Crawl4AI's raw HTML processing.
+
+ """
+
+ # Save to file
+ file_path = Path("docs/examples/tmp/sample.html").absolute()
+ with open(file_path, "w") as f:
+ f.write(raw_html)
+
+ async with AsyncWebCrawler() as crawler:
+ # Crawl raw HTML
+ raw_result = await crawler.arun(
+ url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+ )
+ print("Raw HTML processing:")
+ print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")
+
+ # Crawl local file
+ file_result = await crawler.arun(
+ url=f"file://{file_path}",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("\nLocal file processing:")
+ print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")
+
+ # Clean up
+ os.remove(file_path)
+ print(f"Processed both raw HTML and local file ({file_path})")
+
+async def main():
+ """Run all demo functions sequentially"""
+ print("=== Comprehensive Crawl4AI Demo ===")
+ print("Note: Some examples require API keys or other configurations")
+
+ # Run all demos
+ await demo_basic_crawl()
+ await demo_parallel_crawl()
+ await demo_fit_markdown()
+ await demo_llm_structured_extraction_no_schema()
+ await demo_css_structured_extraction_no_schema()
+ await demo_deep_crawl()
+ await demo_js_interaction()
+ await demo_media_and_links()
+ await demo_screenshot_and_pdf()
+    # await demo_proxy_rotation()  # Disabled by default: requires real proxy servers
+ await demo_raw_html_and_file()
+
+    print("\n=== Demo Complete ===")
+    print("Check the tmp/ directory next to this script for generated files (schema, images/links JSON, screenshot, PDF)")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_examples_set_2.py b/docs/examples/quickstart_examples_set_2.py
new file mode 100644
index 00000000..3adbfc0d
--- /dev/null
+++ b/docs/examples/quickstart_examples_set_2.py
@@ -0,0 +1,562 @@
+import os, sys
+
+from crawl4ai.types import LLMConfig
+
+sys.path.append(
+ os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+import time
+import json
+import re
+from typing import Dict
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.extraction_strategy import (
+ JsonCssExtractionStrategy,
+ LLMExtractionStrategy,
+)
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+# Basic Example - Simple Crawl
+async def simple_crawl():
+ print("\n--- Basic Usage ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+async def clean_content():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ excluded_tags=["nav", "footer", "aside"],
+ remove_overlay_elements=True,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ ),
+ options={"ignore_links": True},
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://en.wikipedia.org/wiki/Apple",
+ config=crawler_config,
+ )
+ full_markdown_length = len(result.markdown.raw_markdown)
+ fit_markdown_length = len(result.markdown.fit_markdown)
+ print(f"Full Markdown Length: {full_markdown_length}")
+ print(f"Fit Markdown Length: {fit_markdown_length}")
+
+
+async def link_analysis():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.ENABLED,
+ exclude_external_links=True,
+ exclude_social_media_links=True,
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=crawler_config,
+ )
+ print(f"Found {len(result.links['internal'])} internal links")
+ print(f"Found {len(result.links['external'])} external links")
+
+ for link in result.links["internal"][:5]:
+ print(f"Href: {link['href']}\nText: {link['text']}\n")
+
+
+# JavaScript Execution Example
+async def simple_example_with_running_js_code():
+ print("\n--- Executing JavaScript and Using CSS Selectors ---")
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
+ # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+# CSS Selector Example
+async def simple_example_with_css_selector():
+ print("\n--- Using CSS Selectors ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+async def media_handling():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ for img in result.media["images"][:5]:
+ print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
+
+async def custom_hook_workflow(verbose=True):
+ async with AsyncWebCrawler() as crawler:
+ # Set a 'before_goto' hook to run custom code just before navigation
+ crawler.crawler_strategy.set_hook(
+ "before_goto",
+ lambda page, context: print("[Hook] Preparing to navigate..."),
+ )
+
+ # Perform the crawl operation
+ result = await crawler.arun(url="https://crawl4ai.com")
+ print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
+
+
+# Proxy Example
+async def use_proxy():
+ print("\n--- Using a Proxy ---")
+ browser_config = BrowserConfig(
+ headless=True,
+ proxy_config={
+ "server": "http://proxy.example.com:8080",
+ "username": "username",
+ "password": "password",
+ },
+ )
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ if result.success:
+ print(result.markdown[:500])
+
+
+# Screenshot Example
+async def capture_and_save_screenshot(url: str, output_path: str):
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ if result.success and result.screenshot:
+ import base64
+
+ screenshot_data = base64.b64decode(result.screenshot)
+ with open(output_path, "wb") as f:
+ f.write(screenshot_data)
+ print(f"Screenshot saved successfully to {output_path}")
+ else:
+ print("Failed to capture screenshot")
+
+
+# LLM Extraction Example
+class OpenAIModelFee(BaseModel):
+ model_name: str = Field(..., description="Name of the OpenAI model.")
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+ output_fee: str = Field(
+ ..., description="Fee for output token for the OpenAI model."
+ )
+
+
+async def extract_structured_data_using_llm(
+ provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+ print(f"\n--- Extracting Structured Data with {provider} ---")
+
+ if api_token is None and provider != "ollama":
+ print(f"API token is required for {provider}. Skipping this example.")
+ return
+
+ browser_config = BrowserConfig(headless=True)
+
+ extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+ if extra_headers:
+ extra_args["extra_headers"] = extra_headers
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ word_count_threshold=1,
+ page_timeout=80000,
+ extraction_strategy=LLMExtractionStrategy(
+            llm_config=LLMConfig(provider=provider, api_token=api_token),
+ schema=OpenAIModelFee.model_json_schema(),
+ extraction_type="schema",
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+ Do not miss any models in the entire content.""",
+ extra_args=extra_args,
+ ),
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://openai.com/api/pricing/", config=crawler_config
+ )
+ print(result.extracted_content)
+
+
+# CSS Extraction Example
+async def extract_structured_data_using_css_extractor():
+ print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+ schema = {
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src",
+ },
+ ],
+ }
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ js_click_tabs = """
+ (async () => {
+ const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+ for(let tab of tabs) {
+ tab.scrollIntoView();
+ tab.click();
+ await new Promise(r => setTimeout(r, 500));
+ }
+ })();
+ """
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=JsonCssExtractionStrategy(schema),
+ js_code=[js_click_tabs],
+ delay_before_return_html=1
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.kidocode.com/degrees/technology", config=crawler_config
+ )
+
+        courses = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(courses)} course entries")
+        print(json.dumps(courses[0], indent=2))
+
+
+# Dynamic Content Examples - Method 1
+async def crawl_dynamic_content_pages_method_1():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+ first_commit = ""
+
+ async def on_execution_started(page, **kwargs):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await commit.evaluate("(element) => element.textContent")
+ commit = re.sub(r"\s+", "", commit)
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ js_next_page = """
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+ """
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ js_code=js_next_page if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ soup = BeautifulSoup(result.cleaned_html, "html.parser")
+ commits = soup.select("li")
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+# Dynamic Content Examples - Method 2
+async def crawl_dynamic_content_pages_method_2():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => {
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ return commits.length > 0 ? commits[0].textContent.trim() : null;
+ };
+
+ const initialCommit = getCurrentCommit();
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+
+ while (true) {
+ await new Promise(resolve => setTimeout(resolve, 100));
+ const newCommit = getCurrentCommit();
+ if (newCommit && newCommit !== initialCommit) {
+ break;
+ }
+ }
+ })();
+ """
+
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.markdown-title",
+ "type": "text",
+ "transform": "strip",
+ },
+ ],
+ }
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+async def cosine_similarity_extraction():
+ from crawl4ai.extraction_strategy import CosineStrategy
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=CosineStrategy(
+ word_count_threshold=10,
+ max_dist=0.2, # Maximum distance between two words
+ linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
+ top_k=3, # Number of top keywords to extract
+ sim_threshold=0.3, # Similarity threshold for clustering
+ semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
+ verbose=True,
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
+ config=crawl_config,
+ )
+ print(json.loads(result.extracted_content)[:5])
+
+
+# Browser Comparison
+async def crawl_custom_browser_type():
+ print("\n--- Browser Comparison ---")
+
+ # Firefox
+ browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Firefox:", time.time() - start)
+ print(result.markdown[:500])
+
+ # WebKit
+ browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("WebKit:", time.time() - start)
+ print(result.markdown[:500])
+
+ # Chromium (default)
+ browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Chromium:", time.time() - start)
+ print(result.markdown[:500])
+
+
+# Anti-Bot and User Simulation
+async def crawl_with_user_simulation():
+ browser_config = BrowserConfig(
+ headless=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+ )
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ magic=True,
+ simulate_user=True,
+ override_navigator=True,
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
+ print(result.markdown)
+
+
+async def ssl_certification():
+ # Configure crawler to fetch SSL certificate
+ config = CrawlerRunConfig(
+ fetch_ssl_certificate=True,
+ cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url="https://example.com", config=config)
+
+ if result.success and result.ssl_certificate:
+ cert = result.ssl_certificate
+
+ tmp_dir = os.path.join(__location__, "tmp")
+ os.makedirs(tmp_dir, exist_ok=True)
+
+ # 1. Access certificate properties directly
+ print("\nCertificate Information:")
+ print(f"Issuer: {cert.issuer.get('CN', '')}")
+ print(f"Valid until: {cert.valid_until}")
+ print(f"Fingerprint: {cert.fingerprint}")
+
+ # 2. Export certificate in different formats
+ cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
+ print("\nCertificate exported to:")
+ print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+ pem_data = cert.to_pem(
+ os.path.join(tmp_dir, "certificate.pem")
+ ) # For web servers
+ print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+ der_data = cert.to_der(
+ os.path.join(tmp_dir, "certificate.der")
+ ) # For Java apps
+ print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+
+# Main execution
+async def main():
+ # Basic examples
+ await simple_crawl()
+ await simple_example_with_running_js_code()
+ await simple_example_with_css_selector()
+
+ # Advanced examples
+ await extract_structured_data_using_css_extractor()
+ await extract_structured_data_using_llm(
+ "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
+ )
+ await crawl_dynamic_content_pages_method_1()
+ await crawl_dynamic_content_pages_method_2()
+
+ # Browser comparisons
+ await crawl_custom_browser_type()
+
+ # Screenshot example
+ await capture_and_save_screenshot(
+ "https://www.example.com",
+ os.path.join(__location__, "tmp/example_screenshot.jpg")
+ )
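+
+    # Additional examples defined above but not run by default
+    # (uncomment as needed; some require real proxies, API keys, or target URLs):
+    # await clean_content()
+    # await link_analysis()
+    # await media_handling()
+    # await custom_hook_workflow()
+    # await use_proxy()
+    # await crawl_with_user_simulation()
+    # await cosine_similarity_extraction()
+    # await ssl_certification()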
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py
deleted file mode 100644
index 0248af29..00000000
--- a/docs/examples/quickstart_sync.py
+++ /dev/null
@@ -1,406 +0,0 @@
-import os
-import time
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
-from rich import print
-from rich.console import Console
-from functools import lru_cache
-
-console = Console()
-
-
-@lru_cache()
-def create_crawler():
- crawler = WebCrawler(verbose=True)
- crawler.warmup()
- return crawler
-
-
-def print_result(result):
- # Print each key in one line and just the first 10 characters of each one's value and three dots
- console.print("\t[bold]Result:[/bold]")
- for key, value in result.model_dump().items():
- if isinstance(value, str) and value:
- console.print(f"\t{key}: [green]{value[:20]}...[/green]")
- if result.extracted_content:
- items = json.loads(result.extracted_content)
- print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
-
-
-def cprint(message, press_any_key=False):
- console.print(message)
- if press_any_key:
- console.print("Press any key to continue...", style="")
- input()
-
-
-def basic_usage(crawler):
- cprint(
- "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
- )
- result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
- cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
- print_result(result)
-
-
-def basic_usage_some_params(crawler):
- cprint(
- "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
- )
- cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
- print_result(result)
-
-
-def screenshot_usage(crawler):
- cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
- result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
- cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
- # Save the screenshot to a file
- with open("screenshot.png", "wb") as f:
- f.write(base64.b64decode(result.screenshot))
- cprint("Screenshot saved to 'screenshot.png'!")
- print_result(result)
-
-
-def understanding_parameters(crawler):
- cprint(
- "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
- )
- cprint(
- "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
- )
-
- # First crawl (reads from cache)
- cprint("1️⃣ First crawl (caches the result):", True)
- start_time = time.time()
- result = crawler.run(url="https://www.nbcnews.com/business")
- end_time = time.time()
- cprint(
- f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
- )
- print_result(result)
-
- # Force to crawl again
- cprint("2️⃣ Second crawl (Force to crawl again):", True)
- start_time = time.time()
- result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
- end_time = time.time()
- cprint(
- f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
- )
- print_result(result)
-
-
-def add_chunking_strategy(crawler):
- # Adding a chunking strategy: RegexChunking
- cprint(
- "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
- True,
- )
- cprint(
- "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=RegexChunking(patterns=["\n\n"]),
- )
- cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
- print_result(result)
-
- # Adding another chunking strategy: NlpSentenceChunking
- cprint(
- "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
- True,
- )
- cprint(
- "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
- )
- cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
- print_result(result)
-
-
-def add_extraction_strategy(crawler):
- # Adding an extraction strategy: CosineStrategy
- cprint(
- "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
- True,
- )
- cprint(
- "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(
- word_count_threshold=10,
- max_dist=0.2,
- linkage_method="ward",
- top_k=3,
- sim_threshold=0.3,
- verbose=True,
- ),
- )
- cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
- print_result(result)
-
- # Using semantic_filter with CosineStrategy
- cprint(
- "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(
- semantic_filter="inflation rent prices",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
- )
- print_result(result)
-
-
-def add_llm_extraction_strategy(crawler):
- # Adding an LLM extraction strategy without instructions
- cprint(
- "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
- True,
- )
- cprint(
- "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
- )
- print_result(result)
-
- # Adding an LLM extraction strategy with instructions
- cprint(
- "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
- True,
- )
- cprint(
- "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-4o",
- api_token=os.getenv("OPENAI_API_KEY"),
- instruction="I am interested in only financial news",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
- )
- print_result(result)
-
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-4o",
- api_token=os.getenv("OPENAI_API_KEY"),
- instruction="Extract only content related to technology",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def targeted_extraction(crawler):
- # Using a CSS selector to extract only H2 tags
- cprint(
- "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
- True,
- )
- result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
- cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
- print_result(result)
-
-
-def interactive_extraction(crawler):
- # Passing JavaScript code to interact with the page
- cprint(
- "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
- True,
- )
- cprint(
- "In this example we try to click the 'Load More' button on the page using JavaScript code."
- )
- js_code = """
- const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
- loadMoreButton && loadMoreButton.click();
- """
- # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
- # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
- cprint(
- "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def multiple_scrip(crawler):
- # Passing JavaScript code to interact with the page
- cprint(
- "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
- True,
- )
- cprint(
- "In this example we try to click the 'Load More' button on the page using JavaScript code."
- )
- js_code = [
- """
- const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
- loadMoreButton && loadMoreButton.click();
- """
- ] * 2
- # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
- # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
- cprint(
- "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def using_crawler_hooks(crawler):
- # Example usage of the hooks for authentication and setting a cookie
- def on_driver_created(driver):
- print("[HOOK] on_driver_created")
- # Example customization: maximize the window
- driver.maximize_window()
-
- # Example customization: logging in to a hypothetical website
- driver.get("https://example.com/login")
-
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
-
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.NAME, "username"))
- )
- driver.find_element(By.NAME, "username").send_keys("testuser")
- driver.find_element(By.NAME, "password").send_keys("password123")
- driver.find_element(By.NAME, "login").click()
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.ID, "welcome"))
- )
- # Add a custom cookie
- driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
- return driver
-
- def before_get_url(driver):
- print("[HOOK] before_get_url")
- # Example customization: add a custom header
- # Enable Network domain for sending headers
- driver.execute_cdp_cmd("Network.enable", {})
- # Add a custom header
- driver.execute_cdp_cmd(
- "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
- )
- return driver
-
- def after_get_url(driver):
- print("[HOOK] after_get_url")
- # Example customization: log the URL
- print(driver.current_url)
- return driver
-
- def before_return_html(driver, html):
- print("[HOOK] before_return_html")
- # Example customization: log the HTML
- print(len(html))
- return driver
-
- cprint(
- "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
- True,
- )
-
- crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook("on_driver_created", on_driver_created)
- crawler_strategy.set_hook("before_get_url", before_get_url)
- crawler_strategy.set_hook("after_get_url", after_get_url)
- crawler_strategy.set_hook("before_return_html", before_return_html)
-
- crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
- crawler.warmup()
- result = crawler.run(url="https://example.com")
-
- cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
- print_result(result=result)
-
-
-def using_crawler_hooks_dleay_example(crawler):
- def delay(driver):
- print("Delaying for 5 seconds...")
- time.sleep(5)
- print("Resuming...")
-
- def create_crawler():
- crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook("after_get_url", delay)
- crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
- crawler.warmup()
- return crawler
-
- cprint(
- "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
- )
- crawler = create_crawler()
- result = crawler.run(url="https://google.com", bypass_cache=True)
-
- cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
- print_result(result)
-
-
-def main():
- cprint(
- "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
- )
- cprint(
- "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
- )
- cprint(
- "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
- )
-
- crawler = create_crawler()
-
- crawler.always_by_pass_cache = True
- basic_usage(crawler)
- # basic_usage_some_params(crawler)
- understanding_parameters(crawler)
-
- crawler.always_by_pass_cache = True
- screenshot_usage(crawler)
- add_chunking_strategy(crawler)
- add_extraction_strategy(crawler)
- add_llm_extraction_strategy(crawler)
- targeted_extraction(crawler)
- interactive_extraction(crawler)
- multiple_scrip(crawler)
-
- cprint(
- "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
- )
-
-
-if __name__ == "__main__":
- main()
diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb
deleted file mode 100644
index 0282aa12..00000000
--- a/docs/examples/quickstart_v0.ipynb
+++ /dev/null
@@ -1,735 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6yLvrXn7yZQI"
- },
- "source": [
- "# Crawl4AI: Advanced Web Crawling and Data Extraction\n",
- "\n",
- "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n",
- "\n",
- "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
- "- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
- "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
- "\n",
- "Let's explore the powerful features of Crawl4AI!"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KIn_9nxFyZQK"
- },
- "source": [
- "## Installation\n",
- "\n",
- "First, let's install Crawl4AI from GitHub:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mSnaxLf3zMog"
- },
- "outputs": [],
- "source": [
- "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xlXqaRtayZQK"
- },
- "outputs": [],
- "source": [
- "!pip install crawl4ai\n",
- "!pip install nest-asyncio\n",
- "!playwright install"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "qKCE7TI7yZQL"
- },
- "source": [
- "Now, let's import the necessary libraries:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "I67tr7aAyZQL"
- },
- "outputs": [],
- "source": [
- "import asyncio\n",
- "import nest_asyncio\n",
- "from crawl4ai import AsyncWebCrawler\n",
- "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n",
- "import json\n",
- "import time\n",
- "from pydantic import BaseModel, Field\n",
- "\n",
- "nest_asyncio.apply()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "h7yR_Rt_yZQM"
- },
- "source": [
- "## Basic Usage\n",
- "\n",
- "Let's start with a simple crawl example:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "yBh6hf4WyZQM",
- "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n",
- "18102\n"
- ]
- }
- ],
- "source": [
- "async def simple_crawl():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n",
- " print(len(result.markdown))\n",
- "await simple_crawl()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9rtkgHI28uI4"
- },
- "source": [
- "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "MzZ0zlJ9yZQM"
- },
- "source": [
- "## Advanced Features\n",
- "\n",
- "### Executing JavaScript and Using CSS Selectors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "gHStF86xyZQM",
- "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
- "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
- "41135\n"
- ]
- }
- ],
- "source": [
- "async def js_and_css():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " js_code=js_code,\n",
- " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n",
- " bypass_cache=True\n",
- " )\n",
- " print(len(result.markdown))\n",
- "\n",
- "await js_and_css()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "cqE_W4coyZQM"
- },
- "source": [
- "### Using a Proxy\n",
- "\n",
- "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "QjAyiAGqyZQM"
- },
- "outputs": [],
- "source": [
- "async def use_proxy():\n",
- " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " bypass_cache=True\n",
- " )\n",
- " print(result.markdown[:500]) # Print first 500 characters\n",
- "\n",
- "# Uncomment the following line to run the proxy example\n",
- "# await use_proxy()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "XTZ88lbayZQN"
- },
- "source": [
- "### Extracting Structured Data with OpenAI\n",
- "\n",
- "Note: You'll need to set your OpenAI API key as an environment variable for this example to work."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "fIOlDayYyZQN",
- "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
- "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n",
- "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n",
- "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n",
- "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
- "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n",
- "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n",
- "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n",
- "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n",
- "5029\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from google.colab import userdata\n",
- "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
- "\n",
- "class OpenAIModelFee(BaseModel):\n",
- " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
- " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
- " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n",
- "\n",
- "async def extract_openai_fees():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(\n",
- " url='https://openai.com/api/pricing/',\n",
- " word_count_threshold=1,\n",
- " extraction_strategy=LLMExtractionStrategy(\n",
- " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n",
- " schema=OpenAIModelFee.schema(),\n",
- " extraction_type=\"schema\",\n",
- " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n",
- " Do not miss any models in the entire content. One extracted model JSON format should look like this:\n",
- " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n",
- " ),\n",
- " bypass_cache=True,\n",
- " )\n",
- " print(len(result.extracted_content))\n",
- "\n",
- "# Uncomment the following line to run the OpenAI extraction example\n",
- "await extract_openai_fees()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "BypA5YxEyZQN"
- },
- "source": [
- "### Advanced Multi-Page Crawling with JavaScript Execution"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "tfkcVQ0b7mw-"
- },
- "source": [
- "## Advanced Multi-Page Crawling with JavaScript Execution\n",
- "\n",
- "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n",
- "\n",
- "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "qUBKGpn3yZQN",
- "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n",
- "Page 1: Found 35 commits\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n",
- "Page 2: Found 35 commits\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n",
- "Page 3: Found 35 commits\n",
- "Successfully crawled 105 commits across 3 pages\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "from bs4 import BeautifulSoup\n",
- "\n",
- "async def crawl_typescript_commits():\n",
- " first_commit = \"\"\n",
- " async def on_execution_started(page):\n",
- " nonlocal first_commit\n",
- " try:\n",
- " while True:\n",
- " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n",
- " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n",
- " commit = await commit.evaluate('(element) => element.textContent')\n",
- " commit = re.sub(r'\\s+', '', commit)\n",
- " if commit and commit != first_commit:\n",
- " first_commit = commit\n",
- " break\n",
- " await asyncio.sleep(0.5)\n",
- " except Exception as e:\n",
- " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n",
- "\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n",
- "\n",
- " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n",
- " session_id = \"typescript_commits_session\"\n",
- " all_commits = []\n",
- "\n",
- " js_next_page = \"\"\"\n",
- " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n",
- " if (button) button.click();\n",
- " \"\"\"\n",
- "\n",
- " for page in range(3): # Crawl 3 pages\n",
- " result = await crawler.arun(\n",
- " url=url,\n",
- " session_id=session_id,\n",
- " css_selector=\"li.Box-sc-g0xbh4-0\",\n",
- " js=js_next_page if page > 0 else None,\n",
- " bypass_cache=True,\n",
- " js_only=page > 0\n",
- " )\n",
- "\n",
- " assert result.success, f\"Failed to crawl page {page + 1}\"\n",
- "\n",
- " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n",
- " commits = soup.select(\"li\")\n",
- " all_commits.extend(commits)\n",
- "\n",
- " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n",
- "\n",
- " await crawler.crawler_strategy.kill_session(session_id)\n",
- " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n",
- "\n",
- "await crawl_typescript_commits()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EJRnYsp6yZQN"
- },
- "source": [
- "### Using JsonCssExtractionStrategy for Fast Structured Output"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "1ZMqIzB_8SYp"
- },
- "source": [
- "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n",
- "\n",
- "1. You define a schema that describes the pattern of data you're interested in extracting.\n",
- "2. The schema includes a base selector that identifies repeating elements on the page.\n",
- "3. Within the schema, you define fields, each with its own selector and type.\n",
- "4. These field selectors are applied within the context of each base selector element.\n",
- "5. The strategy supports nested structures, lists within lists, and various data types.\n",
- "6. You can even include computed fields for more complex data manipulation.\n",
- "\n",
- "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n",
- "\n",
- "For more details and advanced usage, check out the full documentation on the Crawl4AI website."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "trCMR2T9yZQN",
- "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
- "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n",
- "Successfully extracted 11 news teasers\n",
- "{\n",
- " \"category\": \"Business News\",\n",
- " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n",
- " \"summary\": \"The Olympics have long been key to NBCUniversal. Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n",
- " \"time\": \"13h ago\",\n",
- " \"image\": {\n",
- " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n",
- " \"alt\": \"Mike Tirico.\"\n",
- " },\n",
- " \"link\": \"https://www.nbcnews.com/business\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "async def extract_news_teasers():\n",
- " schema = {\n",
- " \"name\": \"News Teaser Extractor\",\n",
- " \"baseSelector\": \".wide-tease-item__wrapper\",\n",
- " \"fields\": [\n",
- " {\n",
- " \"name\": \"category\",\n",
- " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"headline\",\n",
- " \"selector\": \".wide-tease-item__headline\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"summary\",\n",
- " \"selector\": \".wide-tease-item__description\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"time\",\n",
- " \"selector\": \"[data-testid='wide-tease-date']\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"image\",\n",
- " \"type\": \"nested\",\n",
- " \"selector\": \"picture.teasePicture img\",\n",
- " \"fields\": [\n",
- " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n",
- " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n",
- " ],\n",
- " },\n",
- " {\n",
- " \"name\": \"link\",\n",
- " \"selector\": \"a[href]\",\n",
- " \"type\": \"attribute\",\n",
- " \"attribute\": \"href\",\n",
- " },\n",
- " ],\n",
- " }\n",
- "\n",
- " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n",
- "\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " extraction_strategy=extraction_strategy,\n",
- " bypass_cache=True,\n",
- " )\n",
- "\n",
- " assert result.success, \"Failed to crawl the page\"\n",
- "\n",
- " news_teasers = json.loads(result.extracted_content)\n",
- " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n",
- " print(json.dumps(news_teasers[0], indent=2))\n",
- "\n",
- "await extract_news_teasers()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "FnyVhJaByZQN"
- },
- "source": [
- "## Speed Comparison\n",
- "\n",
- "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "agDD186f3wig"
- },
- "source": [
- "💡 **Note on Speed Comparison:**\n",
- "\n",
- "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n",
- "\n",
- "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n",
- "\n",
- "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "F7KwHv8G1LbY"
- },
- "outputs": [],
- "source": [
- "!pip install firecrawl"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "91813zILyZQN",
- "outputId": "663223db-ab89-4976-b233-05ceca62b19b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Firecrawl (simulated):\n",
- "Time taken: 4.38 seconds\n",
- "Content length: 41967 characters\n",
- "Images found: 49\n",
- "\n",
- "Crawl4AI (simple crawl):\n",
- "Time taken: 4.22 seconds\n",
- "Content length: 18221 characters\n",
- "Images found: 49\n",
- "\n",
- "Crawl4AI (with JavaScript execution):\n",
- "Time taken: 9.13 seconds\n",
- "Content length: 34243 characters\n",
- "Images found: 89\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from google.colab import userdata\n",
- "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n",
- "import time\n",
- "from firecrawl import FirecrawlApp\n",
- "\n",
- "async def speed_comparison():\n",
- " # Simulated Firecrawl performance\n",
- " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n",
- " start = time.time()\n",
- " scrape_status = app.scrape_url(\n",
- " 'https://www.nbcnews.com/business',\n",
- " params={'formats': ['markdown', 'html']}\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Firecrawl (simulated):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n",
- " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n",
- " print()\n",
- "\n",
- " async with AsyncWebCrawler() as crawler:\n",
- " # Crawl4AI simple crawl\n",
- " start = time.time()\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " word_count_threshold=0,\n",
- " bypass_cache=True,\n",
- " verbose=False\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Crawl4AI (simple crawl):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(result.markdown)} characters\")\n",
- " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
- " print()\n",
- "\n",
- " # Crawl4AI with JavaScript execution\n",
- " start = time.time()\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n",
- " word_count_threshold=0,\n",
- " bypass_cache=True,\n",
- " verbose=False\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Crawl4AI (with JavaScript execution):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(result.markdown)} characters\")\n",
- " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
- "\n",
- "await speed_comparison()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "OBFFYVJIyZQN"
- },
- "source": [
- "If you run on a local machine with a proper internet speed:\n",
- "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n",
- "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n",
- "\n",
- "Please note that actual performance may vary depending on network conditions and the specific content being crawled."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "A6_1RK1_yZQO"
- },
- "source": [
- "## Conclusion\n",
- "\n",
- "In this notebook, we've explored the powerful features of Crawl4AI, including:\n",
- "\n",
- "1. Basic crawling\n",
- "2. JavaScript execution and CSS selector usage\n",
- "3. Proxy support\n",
- "4. Structured data extraction with OpenAI\n",
- "5. Advanced multi-page crawling with JavaScript execution\n",
- "6. Fast structured output using JsonCssExtractionStrategy\n",
- "7. Speed comparison with other services\n",
- "\n",
- "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n",
- "\n",
- "For more information and advanced usage, please visit the [Crawl4AI documentation](https://docs.crawl4ai.com/).\n",
- "\n",
- "Happy crawling!"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/docs/examples/regex_extraction_quickstart.py b/docs/examples/regex_extraction_quickstart.py
new file mode 100644
index 00000000..54b9c384
--- /dev/null
+++ b/docs/examples/regex_extraction_quickstart.py
@@ -0,0 +1,143 @@
+# == File: regex_extraction_quickstart.py ==
+"""
+Mini–quick-start for RegexExtractionStrategy
+────────────────────────────────────────────
+3 bite-sized demos that parallel the style of *quickstart_examples_set_1.py*:
+
+1. **Default catalog** – scrape a page and pull out matches for the built-in patterns (URLs, currency values, e-mails, …).
+2. **Custom pattern** – add your own regex at instantiation time.
+3. **LLM-assisted schema** – ask the model to write a pattern, cache it, then
+ run extraction _without_ further LLM calls.
+
+Run the whole thing with::
+
+ python regex_extraction_quickstart.py
+"""
+
+import os, json, asyncio
+from pathlib import Path
+from typing import List
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ CrawlResult,
+ RegexExtractionStrategy,
+ LLMConfig,
+)
+
+# ────────────────────────────────────────────────────────────────────────────
+# 1. Default-catalog extraction
+# ────────────────────────────────────────────────────────────────────────────
+async def demo_regex_default() -> None:
+ print("\n=== 1. Regex extraction – default patterns ===")
+
+ url = "https://www.iana.org/domains/example" # has e-mail + URLs
+ strategy = RegexExtractionStrategy(
+ pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
+ )
+ config = CrawlerRunConfig(extraction_strategy=strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ result: CrawlResult = await crawler.arun(url, config=config)
+
+ print(f"Fetched {url} - success={result.success}")
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for d in data[:10]:
+ print(f" {d['label']:<12} {d['value']}")
+ print(f"... total matches: {len(data)}")
+ else:
+ print(" !!! crawl failed")
+
+
+# ────────────────────────────────────────────────────────────────────────────
+# 2. Custom pattern override / extension
+# ────────────────────────────────────────────────────────────────────────────
+async def demo_regex_custom() -> None:
+ print("\n=== 2. Regex extraction – custom price pattern ===")
+
+ url = "https://www.apple.com/shop/buy-mac/macbook-pro"
+ price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
+
+ strategy = RegexExtractionStrategy(custom = price_pattern)
+ config = CrawlerRunConfig(extraction_strategy=strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ result: CrawlResult = await crawler.arun(url, config=config)
+
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for d in data:
+ print(f" {d['value']}")
+ if not data:
+ print(" (No prices found - page layout may have changed)")
+ else:
+ print(" !!! crawl failed")
+
+
+# ────────────────────────────────────────────────────────────────────────────
+# 3. One-shot LLM pattern generation, then fast extraction
+# ────────────────────────────────────────────────────────────────────────────
+async def demo_regex_generate_pattern() -> None:
+ print("\n=== 3. generate_pattern → regex extraction ===")
+
+ cache_dir = Path(__file__).parent / "tmp"
+ cache_dir.mkdir(exist_ok=True)
+ pattern_file = cache_dir / "price_pattern.json"
+
+ url = "https://www.lazada.sg/tag/smartphone/"
+
+ # ── 3-A. build or load the cached pattern
+ if pattern_file.exists():
+ pattern = json.load(pattern_file.open(encoding="utf-8"))
+ print("Loaded cached pattern:", pattern)
+ else:
+ print("Generating pattern via LLM…")
+
+ llm_cfg = LLMConfig(
+ provider="openai/gpt-4o-mini",
+ api_token="env:OPENAI_API_KEY",
+ )
+
+ # pull one sample page as HTML context
+ async with AsyncWebCrawler() as crawler:
+ html = (await crawler.arun(url)).fit_html
+
+ pattern = RegexExtractionStrategy.generate_pattern(
+ label="price",
+ html=html,
+ query="Prices in Malaysian Ringgit (e.g. RM1,299.00 or RM200)",
+ llm_config=llm_cfg,
+ )
+
+ json.dump(pattern, pattern_file.open("w", encoding="utf-8"), indent=2)
+ print("Saved pattern:", pattern_file)
+
+ # ── 3-B. extraction pass – zero LLM calls
+ strategy = RegexExtractionStrategy(custom=pattern)
+ config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)
+
+ async with AsyncWebCrawler() as crawler:
+ result: CrawlResult = await crawler.arun(url, config=config)
+
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for d in data[:15]:
+ print(f" {d['value']}")
+ print(f"... total matches: {len(data)}")
+ else:
+ print(" !!! crawl failed")
+
+
+# ────────────────────────────────────────────────────────────────────────────
+# Entrypoint
+# ────────────────────────────────────────────────────────────────────────────
+async def main() -> None:
+    await demo_regex_default()
+    await demo_regex_custom()
+    await demo_regex_generate_pattern()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/serp_api_project_11_feb.py b/docs/examples/serp_api_project_11_feb.py
new file mode 100644
index 00000000..df0768ed
--- /dev/null
+++ b/docs/examples/serp_api_project_11_feb.py
@@ -0,0 +1,305 @@
+import asyncio
+import json
+from typing import Any, Dict, List, Optional
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ LLMExtractionStrategy,
+ JsonCssExtractionStrategy,
+ CrawlerHub,
+ CrawlResult,
+ DefaultMarkdownGenerator,
+ PruningContentFilter,
+)
+from pathlib import Path
+from pydantic import BaseModel
+
+__current_dir = Path(__file__).parent
+
+# Crawl4ai Hello Web
+async def little_hello_web():
+ async with AsyncWebCrawler() as crawler:
+ result : CrawlResult = await crawler.arun(
+ url="https://www.helloworld.org"
+ )
+ print(result.markdown.raw_markdown[:500])
+
+async def hello_web():
+ browser_config = BrowserConfig(headless=True, verbose=True)
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ )
+ ),
+ )
+ result : CrawlResult = await crawler.arun(
+ url="https://www.helloworld.org", config=crawler_config
+ )
+ print(result.markdown.fit_markdown[:500])
+
+# Naive Approach Using Large Language Models
+async def extract_using_llm():
+ print("Extracting using Large Language Models")
+
+ browser_config = BrowserConfig(headless=True, verbose=True)
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ await crawler.start()
+ try:
+ class Sitelink(BaseModel):
+ title: str
+ link: str
+
+ class GoogleSearchResult(BaseModel):
+ title: str
+ link: str
+ snippet: str
+ sitelinks: Optional[List[Sitelink]] = None
+
+ llm_extraction_strategy = LLMExtractionStrategy(
+ provider = "openai/gpt-4o",
+ schema = GoogleSearchResult.model_json_schema(),
+ instruction="""I want to extract the title, link, snippet, and sitelinks from a Google search result. I shared here the content of div#search from the search result page. We are just interested in organic search results.
+ Example:
+ {
+ "title": "Google",
+ "link": "https://www.google.com",
+ "snippet": "Google is a search engine.",
+ "sitelinks": [
+ {
+ "title": "Gmail",
+ "link": "https://mail.google.com"
+ },
+ {
+ "title": "Google Drive",
+ "link": "https://drive.google.com"
+ }
+ ]
+ }""",
+ # apply_chunking=False,
+ chunk_token_threshold=2 ** 12, # 2^12 = 4096
+ verbose=True,
+ # input_format="html", # html, markdown, cleaned_html
+ input_format="cleaned_html"
+ )
+
+
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ keep_attrs=["id", "class"],
+ keep_data_attributes=True,
+ delay_before_return_html=2,
+ extraction_strategy=llm_extraction_strategy,
+ css_selector="div#search",
+ )
+
+ result : CrawlResult = await crawler.arun(
+ url="https://www.google.com/search?q=apple%20inc&start=0&num=10",
+ config=crawl_config,
+ )
+
+ search_result = {}
+ if result.success:
+ search_result = json.loads(result.extracted_content)
+
+ # save search result to file
+ with open(__current_dir / "search_result_using_llm.json", "w") as f:
+ f.write(json.dumps(search_result, indent=4))
+ print(json.dumps(search_result, indent=4))
+
+ finally:
+ await crawler.close()
+
+# Example of using CrawlerHub
+async def schema_generator():
+ print("Generating schema")
+ html = ""
+
+ # Load html from file
+ with open(__current_dir / "google_search_item.html", "r") as f:
+ html = f.read()
+
+ organic_schema = JsonCssExtractionStrategy.generate_schema(
+ html=html,
+ target_json_example="""{
+ "title": "...",
+ "link": "...",
+ "snippet": "...",
+ "date": "1 hour ago",
+ "sitelinks": [
+ {
+ "title": "...",
+ "link": "..."
+ }
+ ]
+ }""",
+ query="""The given HTML is the crawled HTML from the Google search result, which refers to one HTML element representing one organic Google search result. Please find the schema for the organic search item based on the given HTML. I am interested in the title, link, snippet text, sitelinks, and date.""",
+ )
+
+ print(json.dumps(organic_schema, indent=4))
+
+# Golden Standard
+async def build_schema(html:str, force: bool = False) -> Dict[str, Any]:
+ print("Building schema")
+ schemas = {}
+ if (__current_dir / "organic_schema.json").exists() and not force:
+ with open(__current_dir / "organic_schema.json", "r") as f:
+ schemas["organic"] = json.loads(f.read())
+ else:
+ # Extract schema from html
+ organic_schema = JsonCssExtractionStrategy.generate_schema(
+ html=html,
+ target_json_example="""{
+ "title": "...",
+ "link": "...",
+ "snippet": "...",
+ "date": "1 hour ago",
+ "sitelinks": [
+ {
+ "title": "...",
+ "link": "..."
+ }
+ ]
+ }""",
+ query="""The given html is the crawled html from Google search result. Please find the schema for organic search item in the given html, I am interested in title, link, snippet text, sitelinks and date. Usually they are all inside a div#search.""",
+ )
+
+ # Save schema to file current_dir/organic_schema.json
+ with open(__current_dir / "organic_schema.json", "w") as f:
+ f.write(json.dumps(organic_schema, indent=4))
+
+ schemas["organic"] = organic_schema
+
+ # Repeat the same for top_stories_schema
+ if (__current_dir / "top_stories_schema.json").exists():
+ with open(__current_dir / "top_stories_schema.json", "r") as f:
+ schemas["top_stories"] = json.loads(f.read())
+ else:
+ top_stories_schema = JsonCssExtractionStrategy.generate_schema(
+ html=html,
+ target_json_example="""{
+ "title": "...",
+ "link": "...",
+ "source": "Insider Monkey",
+ "date": "1 hour ago",
+ }""",
+ query="""The given HTML is the crawled HTML from the Google search result. Please find the schema for the Top Stories item in the given HTML. I am interested in the title, link, source, and date.""",
+ )
+
+ with open(__current_dir / "top_stories_schema.json", "w") as f:
+ f.write(json.dumps(top_stories_schema, indent=4))
+
+ schemas["top_stories"] = top_stories_schema
+
+ # Repeat the same for suggested_queries_schema
+ if (__current_dir / "suggested_queries_schema.json").exists():
+ with open(__current_dir / "suggested_queries_schema.json", "r") as f:
+ schemas["suggested_queries"] = json.loads(f.read())
+ else:
+ suggested_queries_schema = JsonCssExtractionStrategy.generate_schema(
+ html=html,
+ target_json_example="""{
+ "query": "A for Apple",
+ }""",
+ query="""The given HTML contains the crawled HTML from Google search results. Please find the schema for each suggested query in the section "relatedSearches" at the bottom of the page. I am interested in the queries only.""",
+ )
+
+ with open(__current_dir / "suggested_queries_schema.json", "w") as f:
+ f.write(json.dumps(suggested_queries_schema, indent=4))
+
+ schemas["suggested_queries"] = suggested_queries_schema
+
+ return schemas
+
+async def search(q: str = "apple inc") -> Dict[str, Any]:
+ print("Searching for:", q)
+
+ browser_config = BrowserConfig(headless=True, verbose=True)
+ crawler = AsyncWebCrawler(config=browser_config)
+ search_result: Dict[str, List[Dict[str, Any]]] = {}
+
+ await crawler.start()
+ try:
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ keep_attrs=["id", "class"],
+ keep_data_attributes=True,
+ delay_before_return_html=2,
+ )
+ from urllib.parse import quote
+ result: CrawlResult = await crawler.arun(
+ f"https://www.google.com/search?q={quote(q)}&start=0&num=10",
+ config=crawl_config
+ )
+
+ if result.success:
+ schemas : Dict[str, Any] = await build_schema(result.html)
+
+ for schema in schemas.values():
+ schema_key = schema["name"].lower().replace(' ', '_')
+ search_result[schema_key] = JsonCssExtractionStrategy(
+ schema=schema
+ ).run(
+ url="",
+ sections=[result.html],
+ )
+
+ # save search result to file
+ with open(__current_dir / "search_result.json", "w") as f:
+ f.write(json.dumps(search_result, indent=4))
+ print(json.dumps(search_result, indent=4))
+
+ finally:
+ await crawler.close()
+
+ return search_result
+
+# Example of using CrawlerHub
+async def hub_example(query: str = "apple inc"):
+ print("Using CrawlerHub")
+ crawler_cls = CrawlerHub.get("google_search")
+ crawler = crawler_cls()
+
+ # Text search
+ text_results = await crawler.run(
+ query=query,
+ search_type="text",
+ schema_cache_path="/Users/unclecode/.crawl4ai"
+ )
+ # Save search result to file
+ with open(__current_dir / "search_result_using_hub.json", "w") as f:
+ f.write(json.dumps(json.loads(text_results), indent=4))
+
+ print(json.dumps(json.loads(text_results), indent=4))
+
+
+async def demo():
+ # Step 1: Introduction & Overview
+ # await little_hello_web()
+ # await hello_web()
+
+ # Step 2: Demo end result, using hub
+ # await hub_example()
+
+    # Step 3: Using LLMs for extraction
+ # await extract_using_llm()
+
+    # Step 4: Get familiar with schema generation
+ # await schema_generator()
+
+ # Step 5: Golden Standard
+ # await search()
+
+ # Step 6: Introduction to CrawlerHub
+ await hub_example()
+
+if __name__ == "__main__":
+ asyncio.run(demo())
diff --git a/docs/examples/session_id_example.py b/docs/examples/session_id_example.py
new file mode 100644
index 00000000..e49b7819
--- /dev/null
+++ b/docs/examples/session_id_example.py
@@ -0,0 +1,38 @@
+import asyncio
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ DefaultMarkdownGenerator,
+ PruningContentFilter,
+ CrawlResult
+)
+
+
+
+async def main():
+ browser_config = BrowserConfig(
+ headless=False,
+ verbose=True,
+ )
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler_config = CrawlerRunConfig(
+ session_id= "hello_world", # This help us to use the same page
+ )
+ result : CrawlResult = await crawler.arun(
+ url="https://www.helloworld.org", config=crawler_config
+ )
+        # Add a breakpoint here; you will see the page stays open and the browser is not closed
+ print(result.markdown.raw_markdown[:500])
+
+ new_config = crawler_config.clone(js_code=["(() => ({'data':'hello'}))()"], js_only=True)
+        result : CrawlResult = await crawler.arun( # No new fetch this time; the JS runs in the already-open page
+ url="https://www.helloworld.org", config= new_config
+ )
+ print(result.js_execution_result) # You should see {'data':'hello'} in the console
+
+        # Get direct access to the Playwright page object. This works only if you use the same session_id and pass the same config
+ page, context = crawler.crawler_strategy.get_page(new_config)
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py
new file mode 100644
index 00000000..fe8e0a2b
--- /dev/null
+++ b/docs/examples/tutorial_v0.5.py
@@ -0,0 +1,460 @@
+import asyncio
+import time
+import re
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import (
+ BestFirstCrawlingStrategy,
+ FilterChain,
+ URLPatternFilter,
+ DomainFilter,
+ ContentTypeFilter,
+)
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai import ProxyConfig
+from crawl4ai import RoundRobinProxyStrategy
+from crawl4ai.content_filter_strategy import LLMContentFilter
+from crawl4ai import DefaultMarkdownGenerator
+from crawl4ai import LLMConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
+from pprint import pprint
+
+
+# 1️⃣ Deep Crawling with Best-First Strategy
+async def deep_crawl():
+ """
+ PART 1: Deep Crawling with Best-First Strategy
+
+ This function demonstrates:
+ - Using the BestFirstCrawlingStrategy
+ - Creating filter chains to narrow down crawl targets
+ - Using a scorer to prioritize certain URLs
+ - Respecting robots.txt rules
+ """
+ print("\n===== DEEP CRAWLING =====")
+ print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")
+
+ # Create a filter chain to filter urls based on patterns, domains and content type
+ filter_chain = FilterChain(
+ [
+ DomainFilter(
+ allowed_domains=["docs.crawl4ai.com"],
+ blocked_domains=["old.docs.crawl4ai.com"],
+ ),
+ URLPatternFilter(patterns=["*core*", "*advanced*"],),
+ ContentTypeFilter(allowed_types=["text/html"]),
+ ]
+ )
+
+ # Create a keyword scorer that prioritises the pages with certain keywords first
+ keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"], weight=0.7
+ )
+
+ # Set up the configuration with robots.txt compliance enabled
+ deep_crawl_config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ filter_chain=filter_chain,
+ url_scorer=keyword_scorer,
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ stream=True,
+ verbose=True,
+ check_robots_txt=True, # Enable robots.txt compliance
+ )
+
+ # Execute the crawl
+ async with AsyncWebCrawler() as crawler:
+ print("\n📊 Starting deep crawl with Best-First strategy...")
+ print(" - Filtering by domain, URL patterns, and content type")
+ print(" - Scoring pages based on keyword relevance")
+ print(" - Respecting robots.txt rules")
+
+ start_time = time.perf_counter()
+ results = []
+
+ async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
+ # Print each result as it comes in
+ depth = result.metadata.get("depth", 0)
+ score = result.metadata.get("score", 0)
+ print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
+ results.append(result)
+
+ duration = time.perf_counter() - start_time
+
+ # Print summary statistics
+ print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+
+ # Group by depth
+ if results:
+ depth_counts = {}
+ for result in results:
+ depth = result.metadata.get("depth", 0)
+ depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+ print("\n📊 Pages crawled by depth:")
+ for depth, count in sorted(depth_counts.items()):
+ print(f" Depth {depth}: {count} pages")
+
+
+# 2️⃣ Memory-Adaptive Dispatcher
+async def memory_adaptive_dispatcher():
+ """
+ PART 2: Memory-Adaptive Dispatcher
+
+ This function demonstrates:
+ - Using MemoryAdaptiveDispatcher to manage system memory
+ - Batch and streaming modes with multiple URLs
+ """
+ print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
+ print("This example shows how to use the memory-adaptive dispatcher for resource management.")
+
+ # Configure the dispatcher (optional, defaults are used if not provided)
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
+ check_interval=0.5, # Check memory every 0.5 seconds
+ )
+
+ # Test URLs
+ urls = [
+ "https://docs.crawl4ai.com",
+ "https://github.com/unclecode/crawl4ai"
+ ]
+
+ async def batch_mode():
+ print("\n📊 BATCH MODE:")
+ print(" In this mode, all results are collected before being returned.")
+
+ async with AsyncWebCrawler() as crawler:
+ start_time = time.perf_counter()
+ results = await crawler.arun_many(
+ urls=urls,
+ config=CrawlerRunConfig(stream=False), # Batch mode
+ dispatcher=dispatcher,
+ )
+
+ print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
+ for result in results:
+ print(f" → {result.url} with status code: {result.status_code}")
+
+ async def stream_mode():
+ print("\n📊 STREAMING MODE:")
+ print(" In this mode, results are processed as they become available.")
+
+ async with AsyncWebCrawler() as crawler:
+ start_time = time.perf_counter()
+ count = 0
+ first_result_time = None
+
+ async for result in await crawler.arun_many(
+ urls=urls,
+ config=CrawlerRunConfig(stream=True), # Stream mode
+ dispatcher=dispatcher,
+ ):
+ count += 1
+ current_time = time.perf_counter() - start_time
+
+ if count == 1:
+ first_result_time = current_time
+ print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
+ else:
+ print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")
+
+ print(f" ✅ Total: {count} results")
+ print(f" ✅ First result: {first_result_time:.2f} seconds")
+ print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
+
+ # Run both examples
+ await batch_mode()
+ await stream_mode()
+
+ print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
+ print(" and manages concurrency based on system resources.")
+
+
+# 3️⃣ HTTP Crawler Strategy
+async def http_crawler_strategy():
+ """
+ PART 3: HTTP Crawler Strategy
+
+ This function demonstrates:
+ - Using the lightweight HTTP-only crawler
+ - Setting custom headers and configurations
+ """
+ print("\n===== HTTP CRAWLER STRATEGY =====")
+ print("This example shows how to use the fast, lightweight HTTP-only crawler.")
+
+ # Use the HTTP crawler strategy
+ http_config = HTTPCrawlerConfig(
+ method="GET",
+ headers={"User-Agent": "MyCustomBot/1.0"},
+ follow_redirects=True,
+ verify_ssl=True
+ )
+
+ print("\n📊 Initializing HTTP crawler strategy...")
+ print(" - Using custom User-Agent: MyCustomBot/1.0")
+ print(" - Following redirects: Enabled")
+ print(" - Verifying SSL: Enabled")
+
+ # Create crawler with HTTP strategy
+ async with AsyncWebCrawler(
+ crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
+ ) as crawler:
+ start_time = time.perf_counter()
+ result = await crawler.arun("https://example.com")
+ duration = time.perf_counter() - start_time
+
+ print(f"\n✅ Crawled in {duration:.2f} seconds")
+ print(f"✅ Status code: {result.status_code}")
+ print(f"✅ Content length: {len(result.html)} bytes")
+
+ # Check if there was a redirect
+ if result.redirected_url and result.redirected_url != result.url:
+ print(f"ℹ️ Redirected from {result.url} to {result.redirected_url}")
+
+ print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
+ print(" than browser-based crawling for simple pages.")
+
+
+# 4️⃣ Proxy Rotation
+async def proxy_rotation():
+ """
+ PART 4: Proxy Rotation
+
+ This function demonstrates:
+ - Setting up a proxy rotation strategy
+ - Using multiple proxies in a round-robin fashion
+ """
+ print("\n===== PROXY ROTATION =====")
+ print("This example shows how to implement proxy rotation for distributed crawling.")
+
+ # Load proxies and create rotation strategy
+ proxies = ProxyConfig.from_env()
+    # e.g. export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
+ if not proxies:
+ print("No proxies found in environment. Set PROXIES env variable!")
+ return
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+ # Create configs
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ proxy_rotation_strategy=proxy_strategy
+ )
+
+    urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+    print("\n📈 Initializing crawler with proxy rotation...")
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+ print("\n🚀 Starting batch crawl with proxy rotation...")
+ results = await crawler.arun_many(
+ urls=urls,
+ config=run_config
+ )
+ for result in results:
+ if result.success:
+ ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+ current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+ if current_proxy and ip_match:
+ print(f"URL {result.url}")
+ print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
+ verified = ip_match.group(0) == current_proxy.ip
+ if verified:
+ print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
+ else:
+ print("❌ Proxy failed or IP mismatch!")
+ print("---")
+ else:
+ print(f"❌ Crawl via proxy failed!: {result.error_message}")
+
+
+# 5️⃣ LLM Content Filter (requires API key)
+async def llm_content_filter():
+ """
+ PART 5: LLM Content Filter
+
+ This function demonstrates:
+ - Configuring LLM providers via LLMConfig
+ - Using LLM to generate focused markdown
+ - LLMConfig for configuration
+
+ Note: Requires a valid API key for the chosen LLM provider
+ """
+ print("\n===== LLM CONTENT FILTER =====")
+ print("This example shows how to use LLM to generate focused markdown content.")
+ print("Note: This example requires an API key. Set it in environment variables.")
+
+ # Create LLM configuration
+ # Replace with your actual API key or set as environment variable
+ llm_config = LLMConfig(
+ provider="gemini/gemini-1.5-pro",
+ api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
+ )
+
+ print("\n📊 Setting up LLM content filter...")
+ print(f" - Provider: {llm_config.provider}")
+ print(" - API token: Using environment variable")
+ print(" - Instruction: Extract key concepts and summaries")
+
+ # Create markdown generator with LLM filter
+ markdown_generator = DefaultMarkdownGenerator(
+ content_filter=LLMContentFilter(
+ llm_config=llm_config,
+ instruction="Extract key concepts and summaries"
+ )
+ )
+
+ config = CrawlerRunConfig(markdown_generator=markdown_generator)
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://docs.crawl4ai.com", config=config)
+        print("\n✅ Generated focused markdown:")
+        pprint(result.markdown.fit_markdown)
+
+
+
+# 6️⃣ PDF Processing
+async def pdf_processing():
+ """
+ PART 6: PDF Processing
+
+ This function demonstrates:
+ - Using PDFCrawlerStrategy and PDFContentScrapingStrategy
+ - Extracting text and metadata from PDFs
+ """
+ print("\n===== PDF PROCESSING =====")
+ print("This example shows how to extract text and metadata from PDF files.")
+
+ # Sample PDF URL
+ pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
+
+ print("\n📊 Initializing PDF crawler...")
+ print(f" - Target PDF: {pdf_url}")
+ print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")
+
+ # Create crawler with PDF strategy
+ async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
+ print("\n🚀 Starting PDF processing...")
+
+ start_time = time.perf_counter()
+ result = await crawler.arun(
+ pdf_url,
+ config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
+ )
+ duration = time.perf_counter() - start_time
+
+ print(f"\n✅ Processed PDF in {duration:.2f} seconds")
+
+ # Show metadata
+ print("\n📄 PDF Metadata:")
+ if result.metadata:
+ for key, value in result.metadata.items():
+ if key not in ["html", "text", "markdown"] and value:
+ print(f" - {key}: {value}")
+ else:
+ print(" No metadata available")
+
+ # Show sample of content
+ if result.markdown:
+ print("\n📝 PDF Content Sample:")
+ content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
+ print(f"---\n{content_sample}\n---")
+ else:
+ print("\n⚠️ No content extracted")
+
+ print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
+ print(" to extract both text content and metadata.")
+
+
+# 7️⃣ LLM Schema Generation (requires API key)
+async def llm_schema_generation():
+ """
+ PART 7: LLM Schema Generation
+
+ This function demonstrates:
+ - Configuring LLM providers via LLMConfig
+ - Using LLM to generate extraction schemas
+ - JsonCssExtractionStrategy
+
+ Note: Requires a valid API key for the chosen LLM provider
+ """
+ print("\n===== LLM SCHEMA GENERATION =====")
+ print("This example shows how to use LLM to automatically generate extraction schemas.")
+ print("Note: This example requires an API key. Set it in environment variables.")
+
+    # Sample HTML (illustrative product snippet; only the text content matters for the demo)
+    sample_html = """
+    <div class="product-card">
+        <h2 class="product-title">Awesome Gaming Laptop</h2>
+        <span class="price">$1,299.99</span>
+        <ul class="specs">
+            <li>16GB RAM</li>
+            <li>512GB SSD</li>
+            <li>RTX 3080</li>
+        </ul>
+        <div class="rating">4.7/5</div>
+    </div>
+    """
+ print("\n📊 Setting up LLMConfig...")
+ # Create LLM configuration
+ llm_config = LLMConfig(
+ provider="gemini/gemini-1.5-pro",
+ api_token="env:GEMINI_API_KEY"
+ )
+ print("\n🚀 Generating schema for product extraction...")
+ print(" This would use the LLM to analyze HTML and create an extraction schema")
+ schema = JsonCssExtractionStrategy.generate_schema(
+ html=sample_html,
+ llm_config = llm_config,
+ query="Extract product name and price"
+ )
+ print("\n✅ Generated Schema:")
+ pprint(schema)
+
+# Run all sections
+async def run_tutorial():
+ """
+ Main function to run all tutorial sections.
+ """
+ print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
+ print("===============================")
+ print("This tutorial demonstrates the key features of Crawl4AI v0.5.0")
+ print("Including deep crawling, memory-adaptive dispatching, advanced filtering,")
+ print("and more powerful extraction capabilities.")
+
+ # Sections to run
+ sections = [
+ deep_crawl, # 1. Deep Crawling with Best-First Strategy
+ memory_adaptive_dispatcher, # 2. Memory-Adaptive Dispatcher
+ http_crawler_strategy, # 3. HTTP Crawler Strategy
+ proxy_rotation, # 4. Proxy Rotation
+ llm_content_filter, # 5. LLM Content Filter
+ pdf_processing, # 6. PDF Processing
+ llm_schema_generation, # 7. Schema Generation using LLM
+ ]
+
+ for section in sections:
+ try:
+ await section()
+ except Exception as e:
+ print(f"⚠️ Error in {section.__name__}: {e}")
+
+ print("\n🎉 TUTORIAL COMPLETE! 🎉")
+ print("You've now explored the key features of Crawl4AI v0.5.0")
+ print("For more information, visit https://docs.crawl4ai.com")
+
+
+# Run the tutorial
+if __name__ == "__main__":
+ asyncio.run(run_tutorial())
\ No newline at end of file
diff --git a/docs/examples/use_geo_location.py b/docs/examples/use_geo_location.py
new file mode 100644
index 00000000..2cfc866f
--- /dev/null
+++ b/docs/examples/use_geo_location.py
@@ -0,0 +1,70 @@
+# use_geo_location.py
+"""
+Example: override locale, timezone, and geolocation using Crawl4ai patterns.
+
+This demo uses `AsyncWebCrawler.arun()` to fetch a page with
+browser context primed for specific locale, timezone, and GPS,
+and saves a screenshot for visual verification.
+"""
+
+import asyncio
+import base64
+from pathlib import Path
+from typing import List
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BrowserConfig,
+ GeolocationConfig,
+ CrawlResult,
+)
+
+async def demo_geo_override():
+ """Demo: Crawl a geolocation-test page with overrides and screenshot."""
+ print("\n=== Geo-Override Crawl ===")
+
+ # 1) Browser setup: use Playwright-managed contexts
+ browser_cfg = BrowserConfig(
+ headless=False,
+ viewport_width=1280,
+ viewport_height=720,
+ use_managed_browser=False,
+ )
+
+ # 2) Run config: include locale, timezone_id, geolocation, and screenshot
+ run_cfg = CrawlerRunConfig(
+ url="https://browserleaks.com/geo", # test page that shows your location
+ locale="en-US", # Accept-Language & UI locale
+ timezone_id="America/Los_Angeles", # JS Date()/Intl timezone
+ geolocation=GeolocationConfig( # override GPS coords
+ latitude=34.0522,
+ longitude=-118.2437,
+ accuracy=10.0,
+ ),
+ screenshot=True, # capture screenshot after load
+ session_id="geo_test", # reuse context if rerunning
+ delay_before_return_html=5
+ )
+
+ async with AsyncWebCrawler(config=browser_cfg) as crawler:
+ # 3) Run crawl (returns list even for single URL)
+ results: List[CrawlResult] = await crawler.arun(
+ url=run_cfg.url,
+ config=run_cfg,
+ )
+ result = results[0]
+
+ # 4) Save screenshot and report path
+ if result.screenshot:
+ __current_dir = Path(__file__).parent
+ out_dir = __current_dir / "tmp"
+ out_dir.mkdir(exist_ok=True)
+ shot_path = out_dir / "geo_test.png"
+ with open(shot_path, "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ print(f"Saved screenshot to {shot_path}")
+ else:
+ print("No screenshot captured, check configuration.")
+
+if __name__ == "__main__":
+ asyncio.run(demo_geo_override())
diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md
index 6b3776d1..b56f216e 100644
--- a/docs/md_v2/advanced/advanced-features.md
+++ b/docs/md_v2/advanced/advanced-features.md
@@ -7,8 +7,8 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi
2. **Capturing PDFs & Screenshots**
3. **Handling SSL Certificates**
4. **Custom Headers**
-5. **Session Persistence & Local Storage**
-6. **Robots.txt Compliance**
+5. **Session Persistence & Local Storage**
+6. **Robots.txt Compliance**
> **Prerequisites**
> - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md)
diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md
index 702d9475..3864f840 100644
--- a/docs/md_v2/advanced/identity-based-crawling.md
+++ b/docs/md_v2/advanced/identity-based-crawling.md
@@ -167,13 +167,210 @@ async with AsyncWebCrawler() as crawler:
---
-## 6. Summary
+## 6. Using the BrowserProfiler Class
-- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`.
-- **Log in** or configure sites as needed, then close the browser.
-- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`.
-- Enjoy **persistent** sessions that reflect your real identity.
-- If you only need quick, ephemeral automation, **Magic Mode** might suffice.
+Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.
+
+### Creating and Managing Profiles with BrowserProfiler
+
+The `BrowserProfiler` class offers a comprehensive API for browser profile management:
+
+```python
+import asyncio
+from crawl4ai import BrowserProfiler
+
+async def manage_profiles():
+ # Create a profiler instance
+ profiler = BrowserProfiler()
+
+ # Create a profile interactively - opens a browser window
+ profile_path = await profiler.create_profile(
+ profile_name="my-login-profile" # Optional: name your profile
+ )
+
+ print(f"Profile saved at: {profile_path}")
+
+ # List all available profiles
+ profiles = profiler.list_profiles()
+
+ for profile in profiles:
+ print(f"Profile: {profile['name']}")
+ print(f" Path: {profile['path']}")
+ print(f" Created: {profile['created']}")
+ print(f" Browser type: {profile['type']}")
+
+ # Get a specific profile path by name
+ specific_profile = profiler.get_profile_path("my-login-profile")
+
+ # Delete a profile when no longer needed
+ success = profiler.delete_profile("old-profile-name")
+
+asyncio.run(manage_profiles())
+```
+
+**How profile creation works:**
+1. A browser window opens for you to interact with
+2. You log in to websites, set preferences, etc.
+3. When you're done, press 'q' in the terminal to close the browser
+4. The profile is saved in the Crawl4AI profiles directory
+5. You can use the returned path with `BrowserConfig.user_data_dir` (see the short sketch below)
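+
+A minimal sketch of step 5, reusing the same `BrowserConfig` options shown in the next example (`use_managed_browser=True` plus `user_data_dir`):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, BrowserProfiler
+
+async def crawl_with_fresh_profile(url: str):
+    profiler = BrowserProfiler()
+    # Interactive step: log in to your sites, then press 'q' in the terminal
+    profile_path = await profiler.create_profile(profile_name="my-login-profile")
+
+    browser_config = BrowserConfig(
+        headless=True,
+        use_managed_browser=True,
+        user_data_dir=profile_path,  # reuse the identity captured above
+    )
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        return await crawler.arun(url)
+
+# asyncio.run(crawl_with_fresh_profile("https://example.com/dashboard"))
+```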
+
+### Interactive Profile Management
+
+The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:
+
+```python
+import asyncio
+from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig
+
+# Define a function to use a profile for crawling
+async def crawl_with_profile(profile_path, url):
+ browser_config = BrowserConfig(
+ headless=True,
+ use_managed_browser=True,
+ user_data_dir=profile_path
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url)
+ return result
+
+async def main():
+ # Create a profiler instance
+ profiler = BrowserProfiler()
+
+ # Launch the interactive profile manager
+ # Passing the crawl function as a callback adds a "crawl with profile" option
+ await profiler.interactive_manager(crawl_callback=crawl_with_profile)
+
+asyncio.run(main())
+```
+
+### Legacy Methods
+
+For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:
+
+```python
+from crawl4ai.browser_manager import ManagedBrowser
+
+# These methods still work but use BrowserProfiler internally
+profiles = ManagedBrowser.list_profiles()
+```
+
+### Complete Example
+
+See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class.
+
+---
+
+## 7. Locale, Timezone, and Geolocation Control
+
+In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings.
+
+### Setting Locale and Timezone
+
+You can set the browser's locale and timezone through `CrawlerRunConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=CrawlerRunConfig(
+ # Set browser locale (language and region formatting)
+ locale="fr-FR", # French (France)
+
+ # Set browser timezone
+ timezone_id="Europe/Paris",
+
+ # Other normal options...
+ magic=True,
+ page_timeout=60000
+ )
+ )
+```
+
+**How it works:**
+- `locale` affects language preferences, date formats, number formats, etc.
+- `timezone_id` affects JavaScript's Date object and time-related functionality
+- These settings are applied when creating the browser context and maintained throughout the session (a quick verification sketch follows)
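+
+One way to confirm the overrides took effect is to evaluate `Intl.DateTimeFormat().resolvedOptions()` in the page via `js_code`, mirroring the `js_execution_result` pattern from `docs/examples/session_id_example.py`. This is a sketch, not the canonical API: if `js_execution_result` is not populated on a fresh crawl in your version, re-run the JS with a `session_id` and `js_only=True` as in that example:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def verify_locale_timezone():
+    config = CrawlerRunConfig(
+        locale="fr-FR",
+        timezone_id="Europe/Paris",
+        # IIFE returning the browser's resolved locale and timezone
+        js_code=["(() => Intl.DateTimeFormat().resolvedOptions())()"],
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        # Expect something like {'locale': 'fr-FR', 'timeZone': 'Europe/Paris', ...}
+        print(result.js_execution_result)
+```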
+
+### Configuring Geolocation
+
+Control the GPS coordinates reported by the browser's geolocation API:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
+
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://maps.google.com", # Or any location-aware site
+ config=CrawlerRunConfig(
+ # Configure precise GPS coordinates
+ geolocation=GeolocationConfig(
+ latitude=48.8566, # Paris coordinates
+ longitude=2.3522,
+ accuracy=100 # Accuracy in meters (optional)
+ ),
+
+ # This site will see you as being in Paris
+ page_timeout=60000
+ )
+ )
+```
+
+**Important notes:**
+- When `geolocation` is specified, the browser is automatically granted permission to access location
+- Websites using the Geolocation API will receive the exact coordinates you specify
+- This affects map services, store locators, delivery services, etc.
+- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile
+
+### Combining with Managed Browsers
+
+These settings work perfectly with managed browsers for a complete identity solution:
+
+```python
+from crawl4ai import (
+ AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
+ GeolocationConfig
+)
+
+browser_config = BrowserConfig(
+ use_managed_browser=True,
+ user_data_dir="/path/to/my-profile",
+ browser_type="chromium"
+)
+
+crawl_config = CrawlerRunConfig(
+ # Location settings
+ locale="es-MX", # Spanish (Mexico)
+ timezone_id="America/Mexico_City",
+ geolocation=GeolocationConfig(
+ latitude=19.4326, # Mexico City
+ longitude=-99.1332
+ )
+)
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=crawl_config)
+```
+
+Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity.
+
+## 8. Summary
+
+- **Create** your user-data directory either:
+ - By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
+ - Or by using the built-in `BrowserProfiler.create_profile()` method
+ - Or through the interactive interface with `profiler.interactive_manager()`
+- **Log in** or configure sites as needed, then close the browser
+- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
+- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation`
+- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
+- **Manage** your profiles with the dedicated `BrowserProfiler` class
+- Enjoy **persistent** sessions that reflect your real identity
+- If you only need quick, ephemeral automation, **Magic Mode** might suffice
**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.
diff --git a/docs/md_v2/advanced/network-console-capture.md b/docs/md_v2/advanced/network-console-capture.md
new file mode 100644
index 00000000..4305a25f
--- /dev/null
+++ b/docs/md_v2/advanced/network-console-capture.md
@@ -0,0 +1,205 @@
+# Network Requests & Console Message Capturing
+
+Crawl4AI can capture all network requests and browser console messages during a crawl, which is invaluable for debugging, security analysis, or understanding page behavior.
+
+## Configuration
+
+To enable network and console capturing, use these configuration options:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+# Enable both network request capture and console message capture
+config = CrawlerRunConfig(
+ capture_network_requests=True, # Capture all network requests and responses
+ capture_console_messages=True # Capture all browser console output
+)
+```
+
+## Example Usage
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ # Enable both network request capture and console message capture
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=config
+ )
+
+ if result.success:
+ # Analyze network requests
+ if result.network_requests:
+ print(f"Captured {len(result.network_requests)} network events")
+
+ # Count request types
+ request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
+ response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
+ failed_count = len([r for r in result.network_requests if r.get("event_type") == "request_failed"])
+
+ print(f"Requests: {request_count}, Responses: {response_count}, Failed: {failed_count}")
+
+ # Find API calls
+ api_calls = [r for r in result.network_requests
+ if r.get("event_type") == "request" and "api" in r.get("url", "")]
+ if api_calls:
+ print(f"Detected {len(api_calls)} API calls:")
+ for call in api_calls[:3]: # Show first 3
+ print(f" - {call.get('method')} {call.get('url')}")
+
+ # Analyze console messages
+ if result.console_messages:
+ print(f"Captured {len(result.console_messages)} console messages")
+
+ # Group by type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print("Message types:", message_types)
+
+ # Show errors (often the most important)
+ errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
+ if errors:
+ print(f"Found {len(errors)} console errors:")
+ for err in errors[:2]: # Show first 2
+ print(f" - {err.get('text', '')[:100]}")
+
+ # Export all captured data to a file for detailed analysis
+ with open("network_capture.json", "w") as f:
+ json.dump({
+ "url": result.url,
+ "network_requests": result.network_requests or [],
+ "console_messages": result.console_messages or []
+ }, f, indent=2)
+
+ print("Exported detailed capture data to network_capture.json")
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Captured Data Structure
+
+### Network Requests
+
+The `result.network_requests` field contains a list of dictionaries, each representing a network event with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `event_type` | Type of event: `"request"`, `"response"`, or `"request_failed"` |
+| `url` | The URL of the request |
+| `timestamp` | Unix timestamp when the event was captured |
+
+#### Request Event Fields
+
+```json
+{
+ "event_type": "request",
+ "url": "https://example.com/api/data.json",
+ "method": "GET",
+ "headers": {"User-Agent": "...", "Accept": "..."},
+ "post_data": "key=value&otherkey=value",
+ "resource_type": "fetch",
+ "is_navigation_request": false,
+ "timestamp": 1633456789.123
+}
+```
+
+#### Response Event Fields
+
+```json
+{
+ "event_type": "response",
+ "url": "https://example.com/api/data.json",
+ "status": 200,
+ "status_text": "OK",
+ "headers": {"Content-Type": "application/json", "Cache-Control": "..."},
+ "from_service_worker": false,
+ "request_timing": {"requestTime": 1234.56, "receiveHeadersEnd": 1234.78},
+ "timestamp": 1633456789.456
+}
+```
+
+#### Failed Request Event Fields
+
+```json
+{
+ "event_type": "request_failed",
+ "url": "https://example.com/missing.png",
+ "method": "GET",
+ "resource_type": "image",
+ "failure_text": "net::ERR_ABORTED 404",
+ "timestamp": 1633456789.789
+}
+```
+
+### Console Messages
+
+The `result.console_messages` field contains a list of dictionaries, each representing a console message with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `type` | Message type: `"log"`, `"error"`, `"warning"`, `"info"`, etc. |
+| `text` | The message text |
+| `timestamp` | Unix timestamp when the message was captured |
+
+#### Console Message Example
+
+```json
+{
+ "type": "error",
+ "text": "Uncaught TypeError: Cannot read property 'length' of undefined",
+ "location": "https://example.com/script.js:123:45",
+ "timestamp": 1633456790.123
+}
+```
+
+## Key Benefits
+
+- **Full Request Visibility**: Capture all network activity including:
+ - Requests (URLs, methods, headers, post data)
+ - Responses (status codes, headers, timing)
+ - Failed requests (with error messages)
+
+- **Console Message Access**: View all JavaScript console output:
+ - Log messages
+ - Warnings
+ - Errors with stack traces
+ - Developer debugging information
+
+- **Debugging Power**: Identify issues such as:
+ - Failed API calls or resource loading
+ - JavaScript errors affecting page functionality
+ - CORS or other security issues
+ - Hidden API endpoints and data flows
+
+- **Security Analysis**: Detect:
+ - Unexpected third-party requests
+ - Data leakage in request payloads
+ - Suspicious script behavior
+
+- **Performance Insights**: Analyze:
+ - Request timing data
+ - Resource loading patterns
+ - Potential bottlenecks
+
+## Use Cases
+
+1. **API Discovery**: Identify hidden endpoints and data flows in single-page applications (see the sketch below)
+2. **Debugging**: Track down JavaScript errors affecting page functionality
+3. **Security Auditing**: Detect unwanted third-party requests or data leakage
+4. **Performance Analysis**: Identify slow-loading resources
+5. **Ad/Tracker Analysis**: Detect and catalog advertising or tracking calls
+
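+As a sketch of use case 1, the captured events can be filtered down to XHR/fetch traffic using only the fields documented above (`event_type`, `resource_type`, `method`, `url`):
+
+```python
+def list_api_calls(network_requests):
+    """Return (method, url) pairs for XHR/fetch requests found in a crawl result."""
+    api_calls = []
+    for event in network_requests or []:
+        if event.get("event_type") != "request":
+            continue
+        if event.get("resource_type") in ("xhr", "fetch"):
+            api_calls.append((event.get("method"), event.get("url")))
+    return api_calls
+
+# Usage after a crawl with capture_network_requests=True:
+#   for method, url in list_api_calls(result.network_requests):
+#       print(method, url)
+```
+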
+This capability is especially valuable for complex sites with heavy JavaScript, single-page applications, or when you need to understand the exact communication happening between a browser and servers.
\ No newline at end of file
diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md
index b951b9a5..ea0f8176 100644
--- a/docs/md_v2/api/arun.md
+++ b/docs/md_v2/api/arun.md
@@ -1,6 +1,6 @@
# `arun()` Parameter Guide (New Approach)
-In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
+In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
```python
await crawler.arun(
@@ -9,11 +9,11 @@ await crawler.arun(
)
```
-Below is an organized look at the parameters that can go inside `CrawlerRunConfig`, divided by their functional areas. For **Browser** settings (e.g., `headless`, `browser_type`), see [BrowserConfig](./parameters.md).
+Below is an organized look at the parameters that can go inside `CrawlerRunConfig`, divided by their functional areas. For **Browser** settings (e.g., `headless`, `browser_type`), see [BrowserConfig](./parameters.md).
---
-## 1. Core Usage
+## 1. Core Usage
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
@@ -23,7 +23,7 @@ async def main():
verbose=True, # Detailed logging
cache_mode=CacheMode.ENABLED, # Use normal read/write cache
check_robots_txt=True, # Respect robots.txt rules
- # ... other parameters
+ # ... other parameters
)
async with AsyncWebCrawler() as crawler:
@@ -38,15 +38,16 @@ async def main():
```
**Key Fields**:
-- `verbose=True` logs each crawl step.
+- `verbose=True` logs each crawl step.
- `cache_mode` decides how to read/write the local crawl cache.
---
-## 2. Cache Control
+## 2. Cache Control
**`cache_mode`** (default: `CacheMode.ENABLED`)
Use a built-in enum from `CacheMode`:
+
- `ENABLED`: Normal caching—reads if available, writes if missing.
- `DISABLED`: No caching—always refetch pages.
- `READ_ONLY`: Reads from cache only; no new writes.
@@ -60,6 +61,7 @@ run_config = CrawlerRunConfig(
```
**Additional flags**:
+
- `bypass_cache=True` acts like `CacheMode.BYPASS`.
- `disable_cache=True` acts like `CacheMode.DISABLED`.
- `no_cache_read=True` acts like `CacheMode.WRITE_ONLY`.
@@ -67,7 +69,7 @@ run_config = CrawlerRunConfig(
---
-## 3. Content Processing & Selection
+## 3. Content Processing & Selection
### 3.1 Text Processing
@@ -111,7 +113,7 @@ run_config = CrawlerRunConfig(
---
-## 4. Page Navigation & Timing
+## 4. Page Navigation & Timing
### 4.1 Basic Browser Flow
@@ -124,12 +126,13 @@ run_config = CrawlerRunConfig(
```
**Key Fields**:
+
- `wait_for`:
- `"css:selector"` or
- `"js:() => boolean"`
- e.g. `js:() => document.querySelectorAll('.item').length > 10`.
+ e.g. `js:() => document.querySelectorAll('.item').length > 10`.
-- `mean_delay` & `max_range`: define random delays for `arun_many()` calls.
+- `mean_delay` & `max_range`: define random delays for `arun_many()` calls.
- `semaphore_count`: concurrency limit when crawling multiple URLs.
### 4.2 JavaScript Execution
@@ -144,7 +147,7 @@ run_config = CrawlerRunConfig(
)
```
-- `js_code` can be a single string or a list of strings.
+- `js_code` can be a single string or a list of strings.
- `js_only=True` means “I’m continuing in the same session with new JS steps, no new full navigation.”
### 4.3 Anti-Bot
@@ -156,13 +159,13 @@ run_config = CrawlerRunConfig(
override_navigator=True
)
```
-- `magic=True` tries multiple stealth features.
-- `simulate_user=True` mimics mouse movements or random delays.
+- `magic=True` tries multiple stealth features.
+- `simulate_user=True` mimics mouse movements or random delays.
- `override_navigator=True` fakes some navigator properties (like user agent checks).
---
-## 5. Session Management
+## 5. Session Management
**`session_id`**:
```python
@@ -174,7 +177,7 @@ If re-used in subsequent `arun()` calls, the same tab/page context is continued
---
-## 6. Screenshot, PDF & Media Options
+## 6. Screenshot, PDF & Media Options
```python
run_config = CrawlerRunConfig(
@@ -191,7 +194,7 @@ run_config = CrawlerRunConfig(
---
-## 7. Extraction Strategy
+## 7. Extraction Strategy
**For advanced data extraction** (CSS/LLM-based), set `extraction_strategy`:
@@ -205,7 +208,7 @@ The extracted data will appear in `result.extracted_content`.
---
-## 8. Comprehensive Example
+## 8. Comprehensive Example
Below is a snippet combining many parameters:
@@ -274,32 +277,33 @@ if __name__ == "__main__":
```
**What we covered**:
-1. **Crawling** the main content region, ignoring external links.
-2. Running **JavaScript** to click “.show-more”.
-3. **Waiting** for “.loaded-block” to appear.
-4. Generating a **screenshot** & **PDF** of the final page.
-5. Extracting repeated “article.post” elements with a **CSS-based** extraction strategy.
+
+1. **Crawling** the main content region, ignoring external links.
+2. Running **JavaScript** to click “.show-more”.
+3. **Waiting** for “.loaded-block” to appear.
+4. Generating a **screenshot** & **PDF** of the final page.
+5. Extracting repeated “article.post” elements with a **CSS-based** extraction strategy.
---
-## 9. Best Practices
+## 9. Best Practices
-1. **Use `BrowserConfig` for global browser** settings (headless, user agent).
-2. **Use `CrawlerRunConfig`** to handle the **specific** crawl needs: content filtering, caching, JS, screenshot, extraction, etc.
-3. Keep your **parameters consistent** in run configs—especially if you’re part of a large codebase with multiple crawls.
-4. **Limit** large concurrency (`semaphore_count`) if the site or your system can’t handle it.
-5. For dynamic pages, set `js_code` or `scan_full_page` so you load all content.
+1. **Use `BrowserConfig` for global browser** settings (headless, user agent).
+2. **Use `CrawlerRunConfig`** to handle the **specific** crawl needs: content filtering, caching, JS, screenshot, extraction, etc.
+3. Keep your **parameters consistent** in run configs—especially if you’re part of a large codebase with multiple crawls.
+4. **Limit** large concurrency (`semaphore_count`) if the site or your system can’t handle it.
+5. For dynamic pages, set `js_code` or `scan_full_page` so you load all content.
---
-## 10. Conclusion
+## 10. Conclusion
-All parameters that used to be direct arguments to `arun()` now belong in **`CrawlerRunConfig`**. This approach:
+All parameters that used to be direct arguments to `arun()` now belong in **`CrawlerRunConfig`**. This approach:
-- Makes code **clearer** and **more maintainable**.
-- Minimizes confusion about which arguments affect global vs. per-crawl behavior.
+- Makes code **clearer** and **more maintainable**.
+- Minimizes confusion about which arguments affect global vs. per-crawl behavior.
- Allows you to create **reusable** config objects for different pages or tasks.
-For a **full** reference, check out the [CrawlerRunConfig Docs](./parameters.md).
+For a **full** reference, check out the [CrawlerRunConfig Docs](./parameters.md).
Happy crawling with your **structured, flexible** config approach!
\ No newline at end of file
diff --git a/docs/md_v2/api/arun_many.md b/docs/md_v2/api/arun_many.md
index edc01145..98d91c08 100644
--- a/docs/md_v2/api/arun_many.md
+++ b/docs/md_v2/api/arun_many.md
@@ -1,6 +1,6 @@
# `arun_many(...)` Reference
-> **Note**: This function is very similar to [`arun()`](./arun.md) but focused on **concurrent** or **batch** crawling. If you’re unfamiliar with `arun()` usage, please read that doc first, then review this for differences.
+> **Note**: This function is very similar to [`arun()`](./arun.md) but focused on **concurrent** or **batch** crawling. If you’re unfamiliar with `arun()` usage, please read that doc first, then review this for differences.
## Function Signature
@@ -16,7 +16,7 @@ async def arun_many(
:param urls: A list of URLs (or tasks) to crawl.
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
- :param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
+ :param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
...
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
"""
@@ -24,22 +24,26 @@ async def arun_many(
## Differences from `arun()`
-1. **Multiple URLs**:
- - Instead of crawling a single URL, you pass a list of them (strings or tasks).
+1. **Multiple URLs**:
+
+ - Instead of crawling a single URL, you pass a list of them (strings or tasks).
- The function returns either a **list** of `CrawlResult` or an **async generator** if streaming is enabled.
-2. **Concurrency & Dispatchers**:
- - **`dispatcher`** param allows advanced concurrency control.
- - If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
+2. **Concurrency & Dispatchers**:
+
+ - **`dispatcher`** param allows advanced concurrency control.
+ - If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
- Dispatchers handle concurrency, rate limiting, and memory-based adaptive throttling (see [Multi-URL Crawling](../advanced/multi-url-crawling.md)).
-3. **Streaming Support**:
+3. **Streaming Support**:
+
- Enable streaming by setting `stream=True` in your `CrawlerRunConfig`.
- When streaming, use `async for` to process results as they become available.
- Ideal for processing large numbers of URLs without waiting for all to complete.
-4. **Parallel** Execution**:
- - `arun_many()` can run multiple requests concurrently under the hood.
+4. **Parallel Execution**:
+
+ - `arun_many()` can run multiple requests concurrently under the hood.
- Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).
### Basic Example (Batch Mode)
@@ -93,19 +97,19 @@ results = await crawler.arun_many(
**Key Points**:
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
-- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
+- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
### Return Value
-Either a **list** of [`CrawlResult`](./crawl-result.md) objects, or an **async generator** if streaming is enabled. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
+Either a **list** of [`CrawlResult`](./crawl-result.md) objects, or an **async generator** if streaming is enabled. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
---
## Dispatcher Reference
-- **`MemoryAdaptiveDispatcher`**: Dynamically manages concurrency based on system memory usage.
-- **`SemaphoreDispatcher`**: Fixed concurrency limit, simpler but less adaptive.
+- **`MemoryAdaptiveDispatcher`**: Dynamically manages concurrency based on system memory usage.
+- **`SemaphoreDispatcher`**: Fixed concurrency limit, simpler but less adaptive.
For advanced usage or custom settings, see [Multi-URL Crawling with Dispatchers](../advanced/multi-url-crawling.md).
@@ -113,12 +117,14 @@ For advanced usage or custom settings, see [Multi-URL Crawling with Dispatchers]
## Common Pitfalls
-1. **Large Lists**: If you pass thousands of URLs, be mindful of memory or rate-limits. A dispatcher can help.
-2. **Session Reuse**: If you need specialized logins or persistent contexts, ensure your dispatcher or tasks handle sessions accordingly.
-3. **Error Handling**: Each `CrawlResult` might fail for different reasons—always check `result.success` or the `error_message` before proceeding.
+1. **Large Lists**: If you pass thousands of URLs, be mindful of memory or rate-limits. A dispatcher can help.
+
+2. **Session Reuse**: If you need specialized logins or persistent contexts, ensure your dispatcher or tasks handle sessions accordingly.
+
+3. **Error Handling**: Each `CrawlResult` might fail for different reasons—always check `result.success` or the `error_message` before proceeding.
---
## Conclusion
-Use `arun_many()` when you want to **crawl multiple URLs** simultaneously or in controlled parallel tasks. If you need advanced concurrency features (like memory-based adaptive throttling or complex rate-limiting), provide a **dispatcher**. Each result is a standard `CrawlResult`, possibly augmented with concurrency stats (`dispatch_result`) for deeper inspection. For more details on concurrency logic and dispatchers, see the [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) docs.
\ No newline at end of file
+Use `arun_many()` when you want to **crawl multiple URLs** simultaneously or in controlled parallel tasks. If you need advanced concurrency features (like memory-based adaptive throttling or complex rate-limiting), provide a **dispatcher**. Each result is a standard `CrawlResult`, possibly augmented with concurrency stats (`dispatch_result`) for deeper inspection. For more details on concurrency logic and dispatchers, see the [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) docs.
\ No newline at end of file
diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md
index e9c6cc6b..50177f3e 100644
--- a/docs/md_v2/api/async-webcrawler.md
+++ b/docs/md_v2/api/async-webcrawler.md
@@ -1,16 +1,20 @@
# AsyncWebCrawler
-The **`AsyncWebCrawler`** is the core class for asynchronous web crawling in Crawl4AI. You typically create it **once**, optionally customize it with a **`BrowserConfig`** (e.g., headless, user agent), then **run** multiple **`arun()`** calls with different **`CrawlerRunConfig`** objects.
+The **`AsyncWebCrawler`** is the core class for asynchronous web crawling in Crawl4AI. You typically create it **once**, optionally customize it with a **`BrowserConfig`** (e.g., headless, user agent), then **run** multiple **`arun()`** calls with different **`CrawlerRunConfig`** objects.
**Recommended usage**:
-1. **Create** a `BrowserConfig` for global browser settings.
-2. **Instantiate** `AsyncWebCrawler(config=browser_config)`.
-3. **Use** the crawler in an async context manager (`async with`) or manage start/close manually.
+
+1. **Create** a `BrowserConfig` for global browser settings.
+
+2. **Instantiate** `AsyncWebCrawler(config=browser_config)`.
+
+3. **Use** the crawler in an async context manager (`async with`) or manage start/close manually.
+
4. **Call** `arun(url, config=crawler_run_config)` for each page you want.
---
-## 1. Constructor Overview
+## 1. Constructor Overview
```python
class AsyncWebCrawler:
@@ -37,7 +41,7 @@ class AsyncWebCrawler:
base_directory:
Folder for storing caches/logs (if relevant).
thread_safe:
- If True, attempts some concurrency safeguards. Usually False.
+ If True, attempts some concurrency safeguards. Usually False.
**kwargs:
Additional legacy or debugging parameters.
"""
@@ -58,11 +62,12 @@ crawler = AsyncWebCrawler(config=browser_cfg)
```
**Notes**:
+
- **Legacy** parameters like `always_bypass_cache` remain for backward compatibility, but prefer to set **caching** in `CrawlerRunConfig`.
---
-## 2. Lifecycle: Start/Close or Context Manager
+## 2. Lifecycle: Start/Close or Context Manager
### 2.1 Context Manager (Recommended)
@@ -90,7 +95,7 @@ Use this style if you have a **long-running** application or need full control o
---
-## 3. Primary Method: `arun()`
+## 3. Primary Method: `arun()`
```python
async def arun(
@@ -130,7 +135,7 @@ For **backward** compatibility, `arun()` can still accept direct arguments like
---
-## 4. Batch Processing: `arun_many()`
+## 4. Batch Processing: `arun_many()`
```python
async def arun_many(
@@ -147,6 +152,7 @@ async def arun_many(
### 4.1 Resource-Aware Crawling
The `arun_many()` method now uses an intelligent dispatcher that:
+
- Monitors system memory usage
- Implements adaptive rate limiting
- Provides detailed progress monitoring
@@ -154,75 +160,47 @@ The `arun_many()` method now uses an intelligent dispatcher that:
### 4.2 Example Usage
+See the [Multi-URL Crawling](../advanced/multi-url-crawling.md) page for a detailed example of how to use `arun_many()`; a minimal batch-mode sketch follows.
+
```python
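+# A minimal batch-mode sketch, assuming the default dispatcher
+# (URLs are illustrative; see the Multi-URL Crawling page for the full example).
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    urls = ["https://example.com/page1", "https://example.com/page2"]
+    run_cfg = CrawlerRunConfig(stream=False)  # batch mode: returns a list of CrawlResult
+
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(urls, config=run_cfg)
+        for result in results:
+            print(f"{result.url}: success={result.success}")
+
+asyncio.run(main())
+```
+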
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
-from crawl4ai.dispatcher import DisplayMode
-
-# Configure browser
-browser_cfg = BrowserConfig(headless=True)
-
-# Configure crawler with rate limiting
-run_cfg = CrawlerRunConfig(
- # Enable rate limiting
- enable_rate_limiting=True,
- rate_limit_config=RateLimitConfig(
- base_delay=(1.0, 2.0), # Random delay between 1-2 seconds
- max_delay=30.0, # Maximum delay after rate limit hits
- max_retries=2, # Number of retries before giving up
- rate_limit_codes=[429, 503] # Status codes that trigger rate limiting
- ),
- # Resource monitoring
- memory_threshold_percent=70.0, # Pause if memory exceeds this
- check_interval=0.5, # How often to check resources
- max_session_permit=3, # Maximum concurrent crawls
- display_mode=DisplayMode.DETAILED.value # Show detailed progress
-)
-
-urls = [
- "https://example.com/page1",
- "https://example.com/page2",
- "https://example.com/page3"
-]
-
-async with AsyncWebCrawler(config=browser_cfg) as crawler:
- results = await crawler.arun_many(urls, config=run_cfg)
- for result in results:
- print(f"URL: {result.url}, Success: {result.success}")
-```
### 4.3 Key Features
-1. **Rate Limiting**
+1. **Rate Limiting**
+
- Automatic delay between requests
- Exponential backoff on rate limit detection
- Domain-specific rate limiting
- Configurable retry strategy
-2. **Resource Monitoring**
+2. **Resource Monitoring**
+
- Memory usage tracking
- Adaptive concurrency based on system load
- Automatic pausing when resources are constrained
-3. **Progress Monitoring**
+3. **Progress Monitoring**
+
- Detailed or aggregated progress display
- Real-time status updates
- Memory usage statistics
-4. **Error Handling**
+4. **Error Handling**
+
- Graceful handling of rate limits
- Automatic retries with backoff
- Detailed error reporting
---
-## 5. `CrawlResult` Output
+## 5. `CrawlResult` Output
Each `arun()` returns a **`CrawlResult`** containing:
- `url`: Final URL (if redirected).
- `html`: Original HTML.
- `cleaned_html`: Sanitized HTML.
-- `markdown_v2` (or future `markdown`): Markdown outputs (raw, fit, etc.).
+- `markdown_v2`: Deprecated; use `markdown` instead.
- `extracted_content`: If an extraction strategy was used (JSON for CSS/LLM strategies).
- `screenshot`, `pdf`: If screenshots/PDF requested.
- `media`, `links`: Information about discovered images/links.
@@ -232,7 +210,7 @@ For details, see [CrawlResult doc](./crawl-result.md).
---
-## 6. Quick Example
+## 6. Quick Example
Below is an example hooking it all together:
@@ -243,14 +221,14 @@ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json
async def main():
- # 1. Browser config
+ # 1. Browser config
browser_cfg = BrowserConfig(
browser_type="firefox",
headless=False,
verbose=True
)
- # 2. Run config
+ # 2. Run config
schema = {
"name": "Articles",
"baseSelector": "article.post",
@@ -295,17 +273,18 @@ asyncio.run(main())
```
**Explanation**:
-- We define a **`BrowserConfig`** with Firefox, no headless, and `verbose=True`.
-- We define a **`CrawlerRunConfig`** that **bypasses cache**, uses a **CSS** extraction schema, has a `word_count_threshold=15`, etc.
+
+- We define a **`BrowserConfig`** with Firefox, no headless, and `verbose=True`.
+- We define a **`CrawlerRunConfig`** that **bypasses cache**, uses a **CSS** extraction schema, has a `word_count_threshold=15`, etc.
- We pass them to `AsyncWebCrawler(config=...)` and `arun(url=..., config=...)`.
---
-## 7. Best Practices & Migration Notes
+## 7. Best Practices & Migration Notes
-1. **Use** `BrowserConfig` for **global** settings about the browser’s environment.
-2. **Use** `CrawlerRunConfig` for **per-crawl** logic (caching, content filtering, extraction strategies, wait conditions).
-3. **Avoid** legacy parameters like `css_selector` or `word_count_threshold` directly in `arun()`. Instead:
+1. **Use** `BrowserConfig` for **global** settings about the browser’s environment.
+2. **Use** `CrawlerRunConfig` for **per-crawl** logic (caching, content filtering, extraction strategies, wait conditions).
+3. **Avoid** legacy parameters like `css_selector` or `word_count_threshold` directly in `arun()`. Instead:
```python
run_cfg = CrawlerRunConfig(css_selector=".main-content", word_count_threshold=20)
@@ -316,16 +295,17 @@ asyncio.run(main())
---
-## 8. Summary
+## 8. Summary
**AsyncWebCrawler** is your entry point to asynchronous crawling:
-- **Constructor** accepts **`BrowserConfig`** (or defaults).
-- **`arun(url, config=CrawlerRunConfig)`** is the main method for single-page crawls.
-- **`arun_many(urls, config=CrawlerRunConfig)`** handles concurrency across multiple URLs.
-- For advanced lifecycle control, use `start()` and `close()` explicitly.
+- **Constructor** accepts **`BrowserConfig`** (or defaults).
+- **`arun(url, config=CrawlerRunConfig)`** is the main method for single-page crawls.
+- **`arun_many(urls, config=CrawlerRunConfig)`** handles concurrency across multiple URLs.
+- For advanced lifecycle control, use `start()` and `close()` explicitly.
**Migration**:
+
- If you used `AsyncWebCrawler(browser_type="chromium", css_selector="...")`, move browser settings to `BrowserConfig(...)` and content/crawl logic to `CrawlerRunConfig(...)`.
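+
+A minimal sketch of that migration (values are illustrative):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    browser_cfg = BrowserConfig(browser_type="chromium", headless=True)  # global browser settings
+    run_cfg = CrawlerRunConfig(css_selector=".main-content")             # per-crawl logic
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun("https://example.com", config=run_cfg)
+        print(result.success)
+
+asyncio.run(main())
+```
+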
-This modular approach ensures your code is **clean**, **scalable**, and **easy to maintain**. For any advanced or rarely used parameters, see the [BrowserConfig docs](../api/parameters.md).
\ No newline at end of file
+This modular approach ensures your code is **clean**, **scalable**, and **easy to maintain**. For any advanced or rarely used parameters, see the [BrowserConfig docs](../api/parameters.md).
\ No newline at end of file
diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md
index 7ed6275a..a27a87d2 100644
--- a/docs/md_v2/api/crawl-result.md
+++ b/docs/md_v2/api/crawl-result.md
@@ -10,15 +10,14 @@ class CrawlResult(BaseModel):
html: str
success: bool
cleaned_html: Optional[str] = None
+ fit_html: Optional[str] = None # Preprocessed HTML optimized for extraction
media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
+ mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
- markdown_v2: Optional[MarkdownGenerationResult] = None
- fit_markdown: Optional[str] = None
- fit_html: Optional[str] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
@@ -52,7 +51,7 @@ if not result.success:
```
### 1.3 **`status_code`** *(Optional[int])*
-**What**: The page’s HTTP status code (e.g., 200, 404).
+**What**: The page's HTTP status code (e.g., 200, 404).
**Usage**:
```python
if result.status_code == 404:
@@ -84,7 +83,7 @@ if result.response_headers:
```
### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])*
-**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site’s certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`,
+**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site's certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`,
`subject`, `valid_from`, `valid_until`, etc.
**Usage**:
```python
@@ -111,14 +110,6 @@ print(len(result.html))
print(result.cleaned_html[:500]) # Show a snippet
```
-### 2.3 **`fit_html`** *(Optional[str])*
-**What**: If a **content filter** or heuristic (e.g., Pruning/BM25) modifies the HTML, the “fit” or post-filter version.
-**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it.
-**Usage**:
-```python
-if result.fit_html:
- print("High-value HTML content:", result.fit_html[:300])
-```
---
@@ -132,20 +123,18 @@ Crawl4AI can convert HTML→Markdown, optionally including:
- **Links as citations** (with a references section)
- **Fit** markdown if a **content filter** is used (like Pruning or BM25)
-### 3.2 **`markdown_v2`** *(Optional[MarkdownGenerationResult])*
-**What**: The **structured** object holding multiple markdown variants. Soon to be consolidated into `markdown`.
**`MarkdownGenerationResult`** includes:
- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion.
- **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations.
- **`references_markdown`** *(str)*: The reference list or footnotes at the end.
-- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered “fit” text.
+- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered "fit" text.
- **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`.
**Usage**:
```python
-if result.markdown_v2:
- md_res = result.markdown_v2
+if result.markdown:
+ md_res = result.markdown
print("Raw MD:", md_res.raw_markdown[:300])
print("Citations MD:", md_res.markdown_with_citations[:300])
print("References:", md_res.references_markdown)
@@ -153,26 +142,15 @@ if result.markdown_v2:
print("Pruned text:", md_res.fit_markdown[:300])
```
-### 3.3 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])*
-**What**: In future versions, `markdown` will fully replace `markdown_v2`. Right now, it might be a `str` or a `MarkdownGenerationResult`.
+### 3.2 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])*
+**What**: Holds the `MarkdownGenerationResult`.
**Usage**:
```python
-# Soon, you might see:
-if isinstance(result.markdown, MarkdownGenerationResult):
- print(result.markdown.raw_markdown[:200])
-else:
- print(result.markdown)
+print(result.markdown.raw_markdown[:200])
+print(result.markdown.fit_markdown)
+print(result.markdown.fit_html)
```
-
-### 3.4 **`fit_markdown`** *(Optional[str])*
-**What**: A direct reference to the final filtered markdown (legacy approach).
-**When**: This is set if a filter or content strategy explicitly writes there. Usually overshadowed by `markdown_v2.fit_markdown`.
-**Usage**:
-```python
-print(result.fit_markdown) # Legacy field, prefer result.markdown_v2.fit_markdown
-```
-
-**Important**: “Fit” content (in `fit_markdown`/`fit_html`) only exists if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
+**Important**: "Fit" content (`fit_markdown`/`fit_html`) is present in `result.markdown` only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
---
@@ -184,7 +162,7 @@ print(result.fit_markdown) # Legacy field, prefer result.markdown_v2.fit_markdo
- `src` *(str)*: Media URL
- `alt` or `title` *(str)*: Descriptive text
-- `score` *(float)*: Relevance score if the crawler’s heuristic found it “important”
+- `score` *(float)*: Relevance score if the crawler's heuristic found it "important"
- `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text
**Usage**:
@@ -252,7 +230,16 @@ if result.pdf:
f.write(result.pdf)
```
-### 5.5 **`metadata`** *(Optional[dict])*
+### 5.5 **`mhtml`** *(Optional[str])*
+**What**: MHTML snapshot of the page if `capture_mhtml=True` in `CrawlerRunConfig`. MHTML (MIME HTML) format preserves the entire web page with all its resources (CSS, images, scripts, etc.) in a single file.
+**Usage**:
+```python
+if result.mhtml:
+ with open("page.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
+```
+
+### 5.6 **`metadata`** *(Optional[dict])*
**What**: Page-level metadata if discovered (title, description, OG data, etc.).
**Usage**:
```python
@@ -269,7 +256,7 @@ A `DispatchResult` object providing additional concurrency and resource usage in
- **`task_id`**: A unique identifier for the parallel task.
- **`memory_usage`** (float): The memory (in MB) used at the time of completion.
-- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task’s execution.
+- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task's execution.
- **`start_time`** / **`end_time`** (datetime): Time range for this crawling task.
- **`error_message`** (str): Any dispatcher- or concurrency-related error encountered.
@@ -287,7 +274,69 @@ for result in results:
---
-## 7. Example: Accessing Everything
+## 7. Network Requests & Console Messages
+
+When you enable network and console message capturing in `CrawlerRunConfig` using `capture_network_requests=True` and `capture_console_messages=True`, the `CrawlResult` will include these fields:
+
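+For example, a minimal sketch enabling both capture flags (the URL is illustrative):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        capture_network_requests=True,
+        capture_console_messages=True,
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        print(len(result.network_requests or []), "network events captured")
+        print(len(result.console_messages or []), "console messages captured")
+
+asyncio.run(main())
+```
+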
+### 7.1 **`network_requests`** *(Optional[List[Dict[str, Any]]])*
+**What**: A list of dictionaries containing information about all network requests, responses, and failures captured during the crawl.
+**Structure**:
+- Each item has an `event_type` field that can be `"request"`, `"response"`, or `"request_failed"`.
+- Request events include `url`, `method`, `headers`, `post_data`, `resource_type`, and `is_navigation_request`.
+- Response events include `url`, `status`, `status_text`, `headers`, and `request_timing`.
+- Failed request events include `url`, `method`, `resource_type`, and `failure_text`.
+- All events include a `timestamp` field.
+
+**Usage**:
+```python
+if result.network_requests:
+ # Count different types of events
+ requests = [r for r in result.network_requests if r.get("event_type") == "request"]
+ responses = [r for r in result.network_requests if r.get("event_type") == "response"]
+ failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]
+
+ print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")
+
+ # Analyze API calls
+ api_calls = [r for r in requests if "api" in r.get("url", "")]
+
+ # Identify failed resources
+ for failure in failures:
+ print(f"Failed to load: {failure.get('url')} - {failure.get('failure_text')}")
+```
+
+### 7.2 **`console_messages`** *(Optional[List[Dict[str, Any]]])*
+**What**: A list of dictionaries containing all browser console messages captured during the crawl.
+**Structure**:
+- Each item has a `type` field indicating the message type (e.g., `"log"`, `"error"`, `"warning"`, etc.).
+- The `text` field contains the actual message text.
+- Some messages include `location` information (URL, line, column).
+- All messages include a `timestamp` field.
+
+**Usage**:
+```python
+if result.console_messages:
+ # Count messages by type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print(f"Message type counts: {message_types}")
+
+ # Display errors (which are usually most important)
+ for msg in result.console_messages:
+ if msg.get("type") == "error":
+ print(f"Error: {msg.get('text')}")
+```
+
+These fields provide deep visibility into the page's network activity and browser console, which is invaluable for debugging, security analysis, and understanding complex web applications.
+
+For more details on network and console capturing, see the [Network & Console Capture documentation](../advanced/network-console-capture.md).
+
+---
+
+## 8. Example: Accessing Everything
```python
async def handle_result(result: CrawlResult):
@@ -302,15 +351,13 @@ async def handle_result(result: CrawlResult):
# HTML
print("Original HTML size:", len(result.html))
print("Cleaned HTML size:", len(result.cleaned_html or ""))
-
+
# Markdown output
- if result.markdown_v2:
- print("Raw Markdown:", result.markdown_v2.raw_markdown[:300])
- print("Citations Markdown:", result.markdown_v2.markdown_with_citations[:300])
- if result.markdown_v2.fit_markdown:
- print("Fit Markdown:", result.markdown_v2.fit_markdown[:200])
- else:
- print("Raw Markdown (legacy):", result.markdown[:200] if result.markdown else "N/A")
+ if result.markdown:
+ print("Raw Markdown:", result.markdown.raw_markdown[:300])
+ print("Citations Markdown:", result.markdown.markdown_with_citations[:300])
+ if result.markdown.fit_markdown:
+ print("Fit Markdown:", result.markdown.fit_markdown[:200])
# Media & Links
if "images" in result.media:
@@ -322,23 +369,43 @@ async def handle_result(result: CrawlResult):
if result.extracted_content:
print("Structured data:", result.extracted_content)
- # Screenshot/PDF
+ # Screenshot/PDF/MHTML
if result.screenshot:
print("Screenshot length:", len(result.screenshot))
if result.pdf:
print("PDF bytes length:", len(result.pdf))
+ if result.mhtml:
+ print("MHTML length:", len(result.mhtml))
+
+ # Network and console capturing
+ if result.network_requests:
+ print(f"Network requests captured: {len(result.network_requests)}")
+ # Analyze request types
+ req_types = {}
+ for req in result.network_requests:
+ if "resource_type" in req:
+ req_types[req["resource_type"]] = req_types.get(req["resource_type"], 0) + 1
+ print(f"Resource types: {req_types}")
+
+ if result.console_messages:
+ print(f"Console messages captured: {len(result.console_messages)}")
+ # Count by message type
+ msg_types = {}
+ for msg in result.console_messages:
+ msg_types[msg.get("type", "unknown")] = msg_types.get(msg.get("type", "unknown"), 0) + 1
+ print(f"Message types: {msg_types}")
```
---
-## 8. Key Points & Future
+## 9. Key Points & Future
-1. **`markdown_v2` vs `markdown`**
- - Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
- - In future versions, everything will unify under **`markdown`**. If you rely on advanced features (citations, fit content), check `markdown_v2`.
+1. **Deprecated legacy properties of CrawlResult**
+ - `markdown_v2` - Deprecated in v0.5. Use `markdown` instead; it now holds the `MarkdownGenerationResult`.
+ - `fit_markdown` and `fit_html` - Deprecated in v0.5. They are now available on the `MarkdownGenerationResult` stored in `result.markdown`, e.g. `result.markdown.fit_markdown` and `result.markdown.fit_html`.
2. **Fit Content**
- - **`fit_markdown`** and **`fit_html`** appear only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
+ - **`fit_markdown`** and **`fit_html`** appear in `MarkdownGenerationResult` only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
- If no filter is used, they remain `None`.
3. **References & Citations**
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index 932a2642..c7ac21ae 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -70,9 +70,9 @@ We group them by category.
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
-| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
-| **`content_filter`** | `RelevantContentFilter` (None) | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`. |
-| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
+| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as the `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
+| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
+| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
@@ -140,6 +140,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. |
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
+| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. |
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
@@ -159,32 +160,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
---
-### G) **Rate Limiting & Resource Management**
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
-| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
-
-The `RateLimitConfig` class has these fields:
-
-| **Field** | **Type / Default** | **What It Does** |
-|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
-| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
-| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
-| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
-
-| **Parameter** | **Type / Default** | **What It Does** |
-|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
-| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
-| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
-| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
-| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
-
----
-
-### H) **Debug & Logging**
+### G) **Debug & Logging**
| **Parameter** | **Type / Default** | **What It Does** |
|----------------|--------------------|---------------------------------------------------------------------------|
@@ -218,7 +194,7 @@ The `clone()` method is particularly useful when you need slightly different con
```python
import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
# Configure the browser
@@ -239,17 +215,6 @@ async def main():
exclude_external_links=True,
wait_for="css:.article-loaded",
screenshot=True,
- enable_rate_limiting=True,
- rate_limit_config=RateLimitConfig(
- base_delay=(1.0, 3.0),
- max_delay=60.0,
- max_retries=3,
- rate_limit_codes=[429, 503]
- ),
- memory_threshold_percent=70.0,
- check_interval=1.0,
- max_session_permit=20,
- display_mode="DETAILED",
stream=True
)
@@ -267,6 +232,7 @@ async def main():
if __name__ == "__main__":
asyncio.run(main())
+```
## 2.4 Compliance & Ethics
@@ -282,11 +248,32 @@ run_config = CrawlerRunConfig(
)
```
-## 3. Putting It All Together
+## 3. **LLMConfig** - Setting up LLM providers
+LLMConfig is used to pass LLM provider configuration to strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. It can currently be used in the following:
+
+1. LLMExtractionStrategy
+2. LLMContentFilter
+3. JsonCssExtractionStrategy.generate_schema
+4. JsonXPathExtractionStrategy.generate_schema
+
+## 3.1 Parameters
+| **Parameter** | **Type / Default** | **What It Does** |
+|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
+| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"` *(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use. |
+| **`api_token`** | Optional. 1. If not provided explicitly, it is read from an environment variable based on the provider; e.g., if a Gemini model is passed as the provider, `"GEMINI_API_KEY"` is read from the environment. 2. Pass the provider's API token directly, e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`. 3. Reference an environment variable with the `"env:"` prefix, e.g. `api_token = "env: GROQ_API_KEY"` | API token to use for the given provider |
+| **`base_url`** | Optional. Custom API endpoint | Use this if your provider exposes a custom API endpoint |
+
+## 3.2 Example Usage
+```python
+import os
+
+from crawl4ai import LLMConfig
+
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
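+
+This `llm_config` object can then be passed to any of the consumers listed above; a minimal sketch with `LLMExtractionStrategy` (the instruction text is illustrative):
+
+```python
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+strategy = LLMExtractionStrategy(
+    llm_config=llm_config,  # reuse the config defined above
+    instruction="Extract the article title and summary",
+)
+```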
+
+## 4. Putting It All Together
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
+- **Use** `LLMConfig` for LLM provider configuration that can be shared across extraction, filtering, and schema-generation tasks: `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema`, and `JsonXPathExtractionStrategy.generate_schema`.
```python
# Create a modified copy with the clone() method
@@ -294,3 +281,4 @@ stream_cfg = run_cfg.clone(
stream=True,
cache_mode=CacheMode.BYPASS
)
+```
diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md
index 06b757d4..a44d0fcd 100644
--- a/docs/md_v2/api/strategies.md
+++ b/docs/md_v2/api/strategies.md
@@ -36,6 +36,45 @@ LLMExtractionStrategy(
)
```
+### RegexExtractionStrategy
+
+Used for fast pattern-based extraction of common entities using regular expressions.
+
+```python
+RegexExtractionStrategy(
+ # Pattern Configuration
+ pattern: IntFlag = RegexExtractionStrategy.Nothing, # Bit flags of built-in patterns to use
+ custom: Optional[Dict[str, str]] = None, # Custom pattern dictionary {label: regex}
+
+ # Input Format
+ input_format: str = "fit_html", # "html", "markdown", "text" or "fit_html"
+)
+
+# Built-in Patterns as Bit Flags
+RegexExtractionStrategy.Email # Email addresses
+RegexExtractionStrategy.PhoneIntl # International phone numbers
+RegexExtractionStrategy.PhoneUS # US-format phone numbers
+RegexExtractionStrategy.Url # HTTP/HTTPS URLs
+RegexExtractionStrategy.IPv4 # IPv4 addresses
+RegexExtractionStrategy.IPv6 # IPv6 addresses
+RegexExtractionStrategy.Uuid # UUIDs
+RegexExtractionStrategy.Currency # Currency values (USD, EUR, etc)
+RegexExtractionStrategy.Percentage # Percentage values
+RegexExtractionStrategy.Number # Numeric values
+RegexExtractionStrategy.DateIso # ISO format dates
+RegexExtractionStrategy.DateUS # US format dates
+RegexExtractionStrategy.Time24h # 24-hour format times
+RegexExtractionStrategy.PostalUS # US postal codes
+RegexExtractionStrategy.PostalUK # UK postal codes
+RegexExtractionStrategy.HexColor # HTML hex color codes
+RegexExtractionStrategy.TwitterHandle # Twitter handles
+RegexExtractionStrategy.Hashtag # Hashtags
+RegexExtractionStrategy.MacAddr # MAC addresses
+RegexExtractionStrategy.Iban # International bank account numbers
+RegexExtractionStrategy.CreditCard # Credit card numbers
+RegexExtractionStrategy.All # All available patterns
+```
+
### CosineStrategy
Used for content similarity-based extraction and clustering.
@@ -131,6 +170,7 @@ OverlappingWindowChunking(
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from crawl4ai import LLMConfig
# Define schema
class Article(BaseModel):
@@ -140,7 +180,7 @@ class Article(BaseModel):
# Create strategy
strategy = LLMExtractionStrategy(
- provider="ollama/llama2",
+ llm_config = LLMConfig(provider="ollama/llama2"),
schema=Article.schema(),
instruction="Extract article details"
)
@@ -155,6 +195,55 @@ result = await crawler.arun(
data = json.loads(result.extracted_content)
```
+### Regex Extraction
+
+```python
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy
+
+# Method 1: Use built-in patterns
+strategy = RegexExtractionStrategy(
+ pattern = RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
+)
+
+# Method 2: Use custom patterns
+price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
+strategy = RegexExtractionStrategy(custom=price_pattern)
+
+# Method 3: Generate pattern with LLM assistance (one-time)
+from crawl4ai import LLMConfig
+
+async with AsyncWebCrawler() as crawler:
+ # Get sample HTML first
+ sample_result = await crawler.arun("https://example.com/products")
+ html = sample_result.fit_html
+
+ # Generate regex pattern once
+ pattern = RegexExtractionStrategy.generate_pattern(
+ label="price",
+ html=html,
+ query="Product prices in USD format",
+ llm_config=LLMConfig(provider="openai/gpt-4o-mini")
+ )
+
+ # Save pattern for reuse
+ import json
+ with open("price_pattern.json", "w") as f:
+ json.dump(pattern, f)
+
+ # Use pattern for extraction (no LLM calls)
+ strategy = RegexExtractionStrategy(custom=pattern)
+ result = await crawler.arun(
+ url="https://example.com/products",
+ config=CrawlerRunConfig(extraction_strategy=strategy)
+ )
+
+ # Process results
+ data = json.loads(result.extracted_content)
+ for item in data:
+ print(f"{item['label']}: {item['value']}")
+```
+
### CSS Extraction
```python
@@ -197,6 +286,7 @@ result = await crawler.arun(
```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking
+from crawl4ai import LLMConfig
# Create chunking strategy
chunker = OverlappingWindowChunking(
@@ -206,7 +296,7 @@ chunker = OverlappingWindowChunking(
# Use with extraction strategy
strategy = LLMExtractionStrategy(
- provider="ollama/llama2",
+ llm_config = LLMConfig(provider="ollama/llama2"),
chunking_strategy=chunker
)
@@ -218,12 +308,28 @@ result = await crawler.arun(
## Best Practices
-1. **Choose the Right Strategy**
- - Use `LLMExtractionStrategy` for complex, unstructured content
- - Use `JsonCssExtractionStrategy` for well-structured HTML
+1. **Choose the Right Strategy**
+ - Use `RegexExtractionStrategy` for common data types like emails, phones, URLs, dates
+ - Use `JsonCssExtractionStrategy` for well-structured HTML with consistent patterns
+ - Use `LLMExtractionStrategy` for complex, unstructured content requiring reasoning
- Use `CosineStrategy` for content similarity and clustering
-2. **Optimize Chunking**
+2. **Strategy Selection Guide**
+ ```
+ Is the target data a common type (email/phone/date/URL)?
+ → RegexExtractionStrategy
+
+ Does the page have consistent HTML structure?
+ → JsonCssExtractionStrategy or JsonXPathExtractionStrategy
+
+ Is the data semantically complex or unstructured?
+ → LLMExtractionStrategy
+
+ Need to find content similar to a specific topic?
+ → CosineStrategy
+ ```
+
+3. **Optimize Chunking**
```python
# For long documents
strategy = LLMExtractionStrategy(
@@ -232,7 +338,26 @@ result = await crawler.arun(
)
```
-3. **Handle Errors**
+4. **Combine Strategies for Best Performance**
+ ```python
+ # First pass: Extract structure with CSS
+ css_strategy = JsonCssExtractionStrategy(product_schema)
+ css_result = await crawler.arun(url, config=CrawlerRunConfig(extraction_strategy=css_strategy))
+ product_data = json.loads(css_result.extracted_content)
+
+ # Second pass: Extract specific fields with regex
+ descriptions = [product["description"] for product in product_data]
+ regex_strategy = RegexExtractionStrategy(
+ pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS,
+ custom={"dimension": r"\d+x\d+x\d+ (?:cm|in)"}
+ )
+
+ # Process descriptions with regex
+ for text in descriptions:
+ matches = regex_strategy.extract("", text) # Direct extraction
+ ```
+
+5. **Handle Errors**
```python
try:
result = await crawler.arun(
@@ -245,11 +370,31 @@ result = await crawler.arun(
print(f"Extraction failed: {e}")
```
-4. **Monitor Performance**
+6. **Monitor Performance**
```python
strategy = CosineStrategy(
verbose=True, # Enable logging
word_count_threshold=20, # Filter short content
top_k=5 # Limit results
)
+ ```
+
+7. **Cache Generated Patterns**
+ ```python
+ # For RegexExtractionStrategy pattern generation
+ import json
+ from pathlib import Path
+
+ cache_dir = Path("./pattern_cache")
+ cache_dir.mkdir(exist_ok=True)
+ pattern_file = cache_dir / "product_pattern.json"
+
+ if pattern_file.exists():
+ with open(pattern_file) as f:
+ pattern = json.load(f)
+ else:
+ # Generate once with LLM
+ pattern = RegexExtractionStrategy.generate_pattern(...)
+ with open(pattern_file, "w") as f:
+ json.dump(pattern, f)
```
\ No newline at end of file
diff --git a/docs/md_v2/ask_ai/ask-ai.css b/docs/md_v2/ask_ai/ask-ai.css
new file mode 100644
index 00000000..c464d43b
--- /dev/null
+++ b/docs/md_v2/ask_ai/ask-ai.css
@@ -0,0 +1,444 @@
+/* ==== File: docs/ask_ai/ask_ai.css ==== */
+
+/* --- Basic Reset & Font --- */
+body {
+ /* Attempt to inherit variables from parent window (iframe context) */
+ /* Fallback values if variables are not inherited */
+ --fallback-bg: #070708;
+ --fallback-font: #e8e9ed;
+ --fallback-secondary: #a3abba;
+ --fallback-primary: #50ffff;
+ --fallback-primary-dimmed: #09b5a5;
+ --fallback-border: #1d1d20;
+ --fallback-code-bg: #1e1e1e;
+ --fallback-invert-font: #222225;
+ --font-stack: dm, Monaco, Courier New, monospace, serif;
+
+ font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
+ background-color: var(--background-color, var(--fallback-bg));
+ color: var(--font-color, var(--fallback-font));
+ margin: 0;
+ padding: 0;
+ font-size: 14px; /* Match global font size */
+ line-height: 1.5em; /* Match global line height */
+ height: 100vh; /* Ensure body takes full height */
+ overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
+ display: flex; /* Use flex for the main container */
+}
+
+a {
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ transition: color 0.2s;
+}
+a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+}
+
+/* --- Main Container Layout --- */
+.ai-assistant-container {
+ display: flex;
+ width: 100%;
+ height: 100%;
+ background-color: var(--background-color, var(--fallback-bg));
+}
+
+/* --- Sidebar Styling --- */
+.sidebar {
+ flex-shrink: 0; /* Prevent sidebars from shrinking */
+ height: 100%;
+ display: flex;
+ flex-direction: column;
+ /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
+ overflow-y: hidden; /* Header fixed, list scrolls */
+}
+
+.left-sidebar {
+ flex-basis: 240px; /* Width of history panel */
+ border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.right-sidebar {
+ flex-basis: 280px; /* Width of citations panel */
+ border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.sidebar header {
+ padding: 0.6em 1em;
+ border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
+ flex-shrink: 0;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+}
+
+.sidebar header h3 {
+ margin: 0;
+ font-size: 1.1em;
+ color: var(--font-color, var(--fallback-font));
+}
+
+.sidebar ul {
+ list-style: none;
+ padding: 0;
+ margin: 0;
+ overflow-y: auto; /* Enable scrolling for the list */
+ flex-grow: 1; /* Allow list to take remaining space */
+ padding: 0.5em 0;
+}
+
+.sidebar ul li {
+ padding: 0.3em 1em;
+}
+.sidebar ul li.no-citations,
+.sidebar ul li.no-history {
+ color: var(--secondary-color, var(--fallback-secondary));
+ font-style: italic;
+ font-size: 0.9em;
+ padding-left: 1em;
+}
+
+.sidebar ul li a {
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ display: block;
+ padding: 0.2em 0.5em;
+ border-radius: 3px;
+ transition: background-color 0.2s, color 0.2s;
+}
+
+.sidebar ul li a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+ background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
+}
+/* Style for active history item */
+#history-list li.active a {
+ color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ font-weight: bold;
+ background-color: rgba(80, 255, 255, 0.12);
+}
+
+/* --- Chat Panel Styling --- */
+#chat-panel {
+ flex-grow: 1; /* Take remaining space */
+ display: flex;
+ flex-direction: column;
+ height: 100%;
+ overflow: hidden; /* Prevent overflow, internal elements handle scroll */
+}
+
+#chat-messages {
+ flex-grow: 1;
+ overflow-y: auto; /* Scrollable chat history */
+ padding: 1em 1.5em;
+ border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.message {
+ margin-bottom: 1em;
+ padding: 0.8em 1.2em;
+ border-radius: 8px;
+ max-width: 90%; /* Slightly wider */
+ line-height: 1.6;
+ /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
+ white-space: pre-wrap;
+ word-wrap: break-word; /* Ensure long words break */
+}
+
+.user-message {
+ background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
+ color: var(--font-color, var(--fallback-font));
+ margin-left: auto; /* Align user messages to the right */
+ text-align: left;
+}
+
+.ai-message {
+ background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
+ color: var(--font-color, var(--fallback-font));
+ margin-right: auto; /* Align AI messages to the left */
+ border: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+.ai-message.welcome-message {
+ border: none;
+ background-color: transparent;
+ max-width: 100%;
+ text-align: center;
+ color: var(--secondary-color, var(--fallback-secondary));
+ white-space: normal;
+}
+
+/* Styles for code within messages */
+.ai-message code {
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
+ /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
+ padding: 0.1em 0.4em;
+ border-radius: 4px;
+ font-size: 0.9em;
+}
+.ai-message pre {
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--background-color, var(--fallback-bg)) !important;
+ padding: 1em;
+ border-radius: 5px;
+ overflow-x: auto;
+ margin: 0.8em 0;
+ white-space: pre;
+}
+.ai-message pre code {
+ background-color: transparent !important;
+ padding: 0;
+ font-size: inherit;
+}
+
+/* Override white-space for specific elements generated by Markdown */
+.ai-message p,
+.ai-message ul,
+.ai-message ol,
+.ai-message blockquote {
+ white-space: normal; /* Allow standard wrapping for block elements */
+}
+
+/* --- Markdown Element Styling within Messages --- */
+.message p {
+ margin-top: 0;
+ margin-bottom: 0.5em;
+}
+.message p:last-child {
+ margin-bottom: 0;
+}
+.message ul,
+.message ol {
+ margin: 0.5em 0 0.5em 1.5em;
+ padding: 0;
+}
+.message li {
+ margin-bottom: 0.2em;
+}
+
+/* Code block styling (adjusts previous rules slightly) */
+.message code {
+ /* Inline code */
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--font-color);
+ padding: 0.1em 0.4em;
+ border-radius: 4px;
+ font-size: 0.9em;
+ /* Ensure inline code breaks nicely */
+ word-break: break-all;
+ white-space: normal; /* Allow inline code to wrap if needed */
+}
+.message pre {
+ /* Code block container */
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--background-color, var(--fallback-bg)) !important;
+ padding: 1em;
+ border-radius: 5px;
+ overflow-x: auto;
+ margin: 0.8em 0;
+ font-size: 0.9em; /* Slightly smaller code blocks */
+}
+.message pre code {
+ /* Code within code block */
+ background-color: transparent !important;
+ padding: 0;
+ font-size: inherit;
+ word-break: normal; /* Don't break words in code blocks */
+ white-space: pre; /* Preserve whitespace strictly in code blocks */
+}
+
+/* Thinking indicator */
+.message-thinking {
+ display: inline-block;
+ width: 5px;
+ height: 5px;
+ background-color: var(--primary-color, var(--fallback-primary));
+ border-radius: 50%;
+ margin-left: 8px;
+ vertical-align: middle;
+ animation: thinking 1s infinite ease-in-out;
+}
+@keyframes thinking {
+ 0%,
+ 100% {
+ opacity: 0.5;
+ transform: scale(0.8);
+ }
+ 50% {
+ opacity: 1;
+ transform: scale(1.2);
+ }
+}
+
+/* --- Thinking Indicator (Blinking Cursor Style) --- */
+.thinking-indicator-cursor {
+ display: inline-block;
+ width: 10px; /* Width of the cursor */
+ height: 1.1em; /* Match line height */
+ background-color: var(--primary-color, var(--fallback-primary));
+ margin-left: 5px;
+ vertical-align: text-bottom; /* Align with text baseline */
+ animation: blink-cursor 1s step-end infinite;
+}
+
+@keyframes blink-cursor {
+ from,
+ to {
+ background-color: transparent;
+ }
+ 50% {
+ background-color: var(--primary-color, var(--fallback-primary));
+ }
+}
+
+#chat-input-area {
+ flex-shrink: 0; /* Prevent input area from shrinking */
+ padding: 1em 1.5em;
+ display: flex;
+ align-items: flex-end; /* Align items to bottom */
+ gap: 10px;
+ background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
+}
+
+#chat-input-area textarea {
+ flex-grow: 1;
+ padding: 0.8em 1em;
+ border: 1px solid var(--progress-bar-background, var(--fallback-border));
+ background-color: var(--background-color, var(--fallback-bg));
+ color: var(--font-color, var(--fallback-font));
+ border-radius: 5px;
+ resize: none; /* Disable manual resize */
+ font-family: inherit;
+ font-size: 1em;
+ line-height: 1.4;
+ max-height: 150px; /* Limit excessive height */
+ overflow-y: auto;
+ /* rows: 2; */
+}
+
+#chat-input-area button {
+ /* Basic button styling - maybe inherit from main theme? */
+ padding: 0.6em 1.2em;
+ border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ color: var(--background-color, var(--fallback-bg));
+ border-radius: 5px;
+ cursor: pointer;
+ font-size: 0.9em;
+ transition: background-color 0.2s, border-color 0.2s;
+ height: min-content; /* Align with bottom of textarea */
+}
+
+#chat-input-area button:hover {
+ background-color: var(--primary-color, var(--fallback-primary));
+ border-color: var(--primary-color, var(--fallback-primary));
+}
+#chat-input-area button:disabled {
+ opacity: 0.6;
+ cursor: not-allowed;
+}
+
+.loading-indicator {
+ font-size: 0.9em;
+ color: var(--secondary-color, var(--fallback-secondary));
+ margin-right: 10px;
+ align-self: center;
+}
+
+/* --- Buttons --- */
+/* Inherit some button styles if possible */
+.btn.btn-sm {
+ color: var(--font-color, var(--fallback-font));
+ padding: 0.2em 0.5em;
+ font-size: 0.8em;
+ border: 1px solid var(--secondary-color, var(--fallback-secondary));
+ background: none;
+ border-radius: 3px;
+ cursor: pointer;
+}
+.btn.btn-sm:hover {
+ border-color: var(--font-color, var(--fallback-font));
+ background-color: var(--progress-bar-background, var(--fallback-border));
+}
+
+/* --- Basic Responsiveness --- */
+@media screen and (max-width: 900px) {
+ .left-sidebar {
+ flex-basis: 200px; /* Shrink history */
+ }
+ .right-sidebar {
+ flex-basis: 240px; /* Shrink citations */
+ }
+}
+
+@media screen and (max-width: 768px) {
+ /* Stack layout on mobile? Or hide sidebars? Hiding for now */
+ .sidebar {
+ display: none; /* Hide sidebars on small screens */
+ }
+ /* Could add toggle buttons later */
+}
+
+
+/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */
+
+
+.sidebar ul li {
+ /* Use flexbox to align link and delete button */
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 0; /* Remove padding from li, add to link/button */
+ margin: 0.1em 0; /* Small vertical margin */
+}
+
+.sidebar ul li a {
+ /* Link takes most space */
+ flex-grow: 1;
+ padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
+ /* Make ellipsis work for long titles */
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ /* Keep existing link styles */
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ display: block;
+ border-radius: 3px;
+ transition: background-color 0.2s, color 0.2s;
+}
+.sidebar ul li a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+ background-color: rgba(80, 255, 255, 0.08);
+}
+
+/* Style for active history item's link */
+#history-list li.active a {
+ color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ font-weight: bold;
+ background-color: rgba(80, 255, 255, 0.12);
+}
+
+/* --- Delete Chat Button --- */
+.delete-chat-btn {
+ flex-shrink: 0; /* Don't shrink */
+ background: none;
+ border: none;
+ color: var(--secondary-color, var(--fallback-secondary));
+ cursor: pointer;
+ padding: 0.4em 0.8em; /* Padding around icon */
+ font-size: 0.9em;
+ opacity: 0.5; /* Dimmed by default */
+ transition: opacity 0.2s, color 0.2s;
+ margin-left: 5px; /* Space between link and button */
+ border-radius: 3px;
+}
+
+.sidebar ul li:hover .delete-chat-btn,
+.delete-chat-btn:hover {
+ opacity: 1; /* Show fully on hover */
+ color: var(--error-color, #ff3c74); /* Use error color on hover */
+}
+.delete-chat-btn:focus {
+ outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
+ opacity: 1;
+}
diff --git a/docs/md_v2/ask_ai/ask-ai.js b/docs/md_v2/ask_ai/ask-ai.js
new file mode 100644
index 00000000..bb1b370c
--- /dev/null
+++ b/docs/md_v2/ask_ai/ask-ai.js
@@ -0,0 +1,607 @@
+// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====
+
+document.addEventListener("DOMContentLoaded", () => {
+ console.log("AI Assistant JS V2 Loaded");
+
+ // --- DOM Element Selectors ---
+ const historyList = document.getElementById("history-list");
+ const newChatButton = document.getElementById("new-chat-button");
+ const chatMessages = document.getElementById("chat-messages");
+ const chatInput = document.getElementById("chat-input");
+ const sendButton = document.getElementById("send-button");
+ const citationsList = document.getElementById("citations-list");
+
+ // --- Constants ---
+ const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
+ const CHAT_PREFIX = "aiAssistantChat_v1_";
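+ // localStorage layout: CHAT_INDEX_KEY holds a JSON array of { id, title } entries
+ // (most recent first); each conversation is stored under CHAT_PREFIX + id as a JSON
+ // array of { sender, text, citations? } message objects.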
+
+ // --- State ---
+ let currentChatId = null;
+ let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
+ let isThinking = false;
+ let streamInterval = null; // To control the streaming interval
+
+ // --- Event Listeners ---
+ sendButton.addEventListener("click", handleSendMessage);
+ chatInput.addEventListener("keydown", handleInputKeydown);
+ newChatButton.addEventListener("click", handleNewChat);
+ chatInput.addEventListener("input", autoGrowTextarea);
+
+ // --- Initialization ---
+ loadChatHistoryIndex(); // Load history list on startup
+ const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param
+ if (!initialQuery) {
+ loadInitialChat(); // Load normally if no query
+ }
+
+ // --- Core Functions ---
+
+ function handleSendMessage() {
+ const userMessageText = chatInput.value.trim();
+ if (!userMessageText || isThinking) return;
+
+ setThinking(true); // Start thinking state
+
+ // Add user message to state and UI
+ const userMessage = { sender: "user", text: userMessageText };
+ conversationHistory.push(userMessage);
+ addMessageToChat(userMessage, false); // Add user message (no thinking indicator)
+
+ chatInput.value = "";
+ autoGrowTextarea(); // Reset textarea height
+
+ // Prepare for AI response (create empty div)
+ const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator
+
+ // TODO: Generate fingerprint/JWT here
+
+ // TODO: Send `conversationHistory` + JWT to backend API
+ // Replace placeholder below with actual API call
+ // The backend should ideally return a stream of text tokens
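+ // A minimal sketch of what that call might look like (the endpoint path, payload
+ // shape and auth header below are assumptions, not part of this PR):
+ //
+ //   const resp = await fetch("/api/ask", {
+ //       method: "POST",
+ //       headers: { "Content-Type": "application/json", "Authorization": `Bearer ${jwt}` },
+ //       body: JSON.stringify({ messages: conversationHistory }),
+ //   });
+ //   const reader = resp.body.getReader();
+ //   const decoder = new TextDecoder();
+ //   let streamedText = "";
+ //   while (true) {
+ //       const { value, done } = await reader.read();
+ //       if (done) break;
+ //       streamedText += decoder.decode(value, { stream: true });
+ //       aiMessageDiv.innerHTML = marked.parse(streamedText);
+ //       scrollToBottom();
+ //   }
+ //   // (handleSendMessage would need to be declared async for the awaits above.)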
+
+ // --- Placeholder Streaming Simulation ---
+ const simulatedFullResponse = `Okay, Here’s a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:
+
+\`\`\`python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com")
+ print(result.markdown[:300]) # Print first 300 chars
+
+if __name__ == "__main__":
+ asyncio.run(main())
+\`\`\`
+
+A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;
+
+ // Simulate receiving the response stream
+ streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);
+
+ // // Simulate receiving citations *after* stream starts (or with first chunk)
+ // setTimeout(() => {
+ // addCitations([
+ // { title: "Simulated Doc 1", url: "#sim1" },
+ // { title: "Another Concept", url: "#sim2" },
+ // ]);
+ // }, 500); // Citations appear shortly after thinking starts
+ }
+
+ function handleInputKeydown(event) {
+ if (event.key === "Enter" && !event.shiftKey) {
+ event.preventDefault();
+ handleSendMessage();
+ }
+ }
+
+ function addMessageToChat(message, addThinkingIndicator = false) {
+ const messageDiv = document.createElement("div");
+ messageDiv.classList.add("message", `${message.sender}-message`);
+
+ // Parse markdown and set HTML
+ messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";
+
+ if (message.sender === "ai") {
+ // Apply Syntax Highlighting AFTER setting innerHTML
+ messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
+ if (typeof hljs !== "undefined") {
+ // Check if already highlighted to prevent double-highlighting issues
+ if (!block.classList.contains("hljs")) {
+ hljs.highlightElement(block);
+ }
+ } else {
+ console.warn("highlight.js (hljs) not found for syntax highlighting.");
+ }
+ });
+
+ // Add thinking indicator if needed (and not already present)
+ if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
+ const thinkingDiv = document.createElement("div");
+ thinkingDiv.className = "thinking-indicator-cursor";
+ messageDiv.appendChild(thinkingDiv);
+ }
+ } else {
+ // User messages need no extra handling here (markdown is rendered above for both senders)
+ // messageDiv.textContent = message.text;
+ }
+
+ // wrap each pre in a div.terminal
+ messageDiv.querySelectorAll("pre").forEach((block) => {
+ const wrapper = document.createElement("div");
+ wrapper.className = "terminal";
+ block.parentNode.insertBefore(wrapper, block);
+ wrapper.appendChild(block);
+ });
+
+ chatMessages.appendChild(messageDiv);
+ // Scroll only if user is near the bottom? (More advanced)
+ // Simple scroll for now:
+ scrollToBottom();
+ return messageDiv; // Return the created element
+ }
+
+ function streamSimulatedResponse(messageDiv, fullText) {
+ const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
+ if (thinkingIndicator) thinkingIndicator.remove();
+
+ const tokens = fullText.split(/(\s+)/);
+ let currentText = "";
+ let tokenIndex = 0;
+ // Clear previous interval just in case
+ if (streamInterval) clearInterval(streamInterval);
+
+ streamInterval = setInterval(() => {
+ const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor shown while streaming (class defined in ask-ai.css)
+ if (tokenIndex < tokens.length) {
+ currentText += tokens[tokenIndex];
+ // Render intermediate markdown + cursor
+ messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
+ // Re-highlight code blocks on each stream update - might be slightly inefficient
+ // but ensures partial code blocks look okay. Highlight only final on completion.
+ // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
+ // hljs.highlightElement(block);
+ // });
+ scrollToBottom(); // Keep scrolling as content streams
+ tokenIndex++;
+ } else {
+ // Streaming finished
+ clearInterval(streamInterval);
+ streamInterval = null;
+
+ // Final render without cursor
+ messageDiv.innerHTML = marked.parse(currentText);
+
+ // === Final Syntax Highlighting ===
+ messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
+ if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
+ hljs.highlightElement(block);
+ }
+ });
+
+ // === Extract Citations ===
+ const citations = extractMarkdownLinks(currentText);
+
+ // Wrap each pre in a div.terminal
+ messageDiv.querySelectorAll("pre").forEach((block) => {
+ const wrapper = document.createElement("div");
+ wrapper.className = "terminal";
+ block.parentNode.insertBefore(wrapper, block);
+ wrapper.appendChild(block);
+ });
+
+ const aiMessage = { sender: "ai", text: currentText, citations: citations };
+ conversationHistory.push(aiMessage);
+ updateCitationsDisplay();
+ saveCurrentChat();
+ setThinking(false);
+ }
+ }, 50); // Adjust speed
+ }
+
+ // === NEW Function to Extract Links ===
+ function extractMarkdownLinks(markdownText) {
+ const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
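+ // e.g. "see the [quickstart](/core/quickstart)" -> { title: "quickstart", url: "/core/quickstart" }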
+ const citations = [];
+ let match;
+ while ((match = regex.exec(markdownText)) !== null) {
+ // Avoid adding self-links from within the citations list if AI includes them
+ if (!match[2].startsWith("#citation-")) {
+ citations.push({
+ title: match[1].trim(),
+ url: match[2].trim(),
+ });
+ }
+ }
+ // Optional: Deduplicate links based on URL
+ const uniqueCitations = citations.filter(
+ (citation, index, self) => index === self.findIndex((c) => c.url === citation.url)
+ );
+ return uniqueCitations;
+ }
+
+ // === REVISED Function to Display Citations ===
+ function updateCitationsDisplay() {
+ let lastCitations = null;
+ // Find the most recent AI message with citations
+ for (let i = conversationHistory.length - 1; i >= 0; i--) {
+ if (
+ conversationHistory[i].sender === "ai" &&
+ conversationHistory[i].citations &&
+ conversationHistory[i].citations.length > 0
+ ) {
+ lastCitations = conversationHistory[i].citations;
+ break; // Found the latest citations
+ }
+ }
+
+ citationsList.innerHTML = ""; // Clear previous
+ if (!lastCitations) {
+ citationsList.innerHTML = '<li>No citations available.</li>';
+ return;
+ }
+
+ lastCitations.forEach((citation, index) => {
+ const li = document.createElement("li");
+ const a = document.createElement("a");
+ // Generate a unique ID for potential internal linking if needed
+ // a.id = `citation-${index}`;
+ a.href = citation.url || "#";
+ a.textContent = citation.title;
+ a.target = "_top"; // Open in main window
+ li.appendChild(a);
+ citationsList.appendChild(li);
+ });
+ }
+
+ function addCitations(citations) {
+ citationsList.innerHTML = ""; // Clear
+ if (!citations || citations.length === 0) {
+ citationsList.innerHTML = '<li>No citations available.</li>';
+ return;
+ }
+ citations.forEach((citation) => {
+ const li = document.createElement("li");
+ const a = document.createElement("a");
+ a.href = citation.url || "#";
+ a.textContent = citation.title;
+ a.target = "_top"; // Open in main window
+ li.appendChild(a);
+ citationsList.appendChild(li);
+ });
+ }
+
+ function setThinking(thinking) {
+ isThinking = thinking;
+ sendButton.disabled = thinking;
+ chatInput.disabled = thinking;
+ chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI...";
+ // Stop any existing stream if we start thinking again (e.g., rapid resend)
+ if (thinking && streamInterval) {
+ clearInterval(streamInterval);
+ streamInterval = null;
+ }
+ }
+
+ function autoGrowTextarea() {
+ chatInput.style.height = "auto";
+ chatInput.style.height = `${chatInput.scrollHeight}px`;
+ }
+
+ function scrollToBottom() {
+ chatMessages.scrollTop = chatMessages.scrollHeight;
+ }
+
+ // --- Query Parameter Handling ---
+ function checkForInitialQuery(locationToCheck) {
+ // <-- Receive location object
+ if (!locationToCheck) {
+ console.warn("Ask AI: Could not access parent window location.");
+ return false;
+ }
+ const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
+ const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
+
+ if (encodedQuery) {
+ console.log("Initial query found (qq):", encodedQuery);
+ try {
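+ // Inverse of the encoding used in selection_ask_ai.js:
+ // btoa(unescape(encodeURIComponent(text))) there, atob/escape/decodeURIComponent here.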
+ const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
+ console.log("Decoded query:", decodedText);
+
+ // Start new chat immediately
+ handleNewChat(true);
+
+ // Delay setting input and sending message slightly
+ setTimeout(() => {
+ chatInput.value = decodedText;
+ autoGrowTextarea();
+ handleSendMessage();
+
+ // Clean the PARENT window's URL
+ try {
+ const cleanUrl = locationToCheck.pathname;
+ // Use parent's history object
+ window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
+ } catch (e) {
+ console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
+ // This might fail due to cross-origin restrictions if served differently,
+ // but should work fine with mkdocs serve on the same origin.
+ }
+ }, 100);
+
+ return true; // Query processed
+ } catch (e) {
+ console.error("Error decoding initial query (qq):", e);
+ // Clean the PARENT window's URL even on error
+ try {
+ const cleanUrl = locationToCheck.pathname;
+ window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
+ } catch (cleanError) {
+ console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
+ }
+ return false;
+ }
+ }
+ return false; // No 'qq' query found
+ }
+
+ // --- History Management ---
+
+ function handleNewChat(isFromQuery = false) {
+ if (isThinking) return; // Don't allow new chat while responding
+
+ // Only save if NOT triggered immediately by a query parameter load
+ if (!isFromQuery) {
+ saveCurrentChat();
+ }
+
+ currentChatId = `chat_${Date.now()}`;
+ conversationHistory = []; // Clear message history state
+ chatMessages.innerHTML = ""; // Start with clean slate for query
+ if (!isFromQuery) {
+ // Show welcome only if manually started
+ // chatMessages.innerHTML =
+ //     '<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
+ chatMessages.innerHTML =
+     '<div class="message ai-message welcome-message">We will launch this feature very soon.</div>';
+ }
+ addCitations([]); // Clear citations
+ updateCitationsDisplay(); // Clear UI
+
+ // Add to index and save
+ let index = loadChatIndex();
+ // Generate a generic title initially, update later
+ const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
+ // index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start
+ index.unshift({ id: currentChatId, title: newTitle });
+ saveChatIndex(index);
+
+ renderHistoryList(index); // Update UI
+ setActiveHistoryItem(currentChatId);
+ saveCurrentChat(); // Save the empty new chat state
+ }
+
+ function loadChat(chatId) {
+ if (isThinking || chatId === currentChatId) return;
+
+ // Check if chat data actually exists before proceeding
+ const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
+ if (storedChat === null) {
+ console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
+ deleteChatData(chatId); // Clean up index
+ loadChatHistoryIndex(); // Reload history list
+ loadInitialChat(); // Load next available chat
+ return;
+ }
+
+ console.log(`Loading chat: ${chatId}`);
+ saveCurrentChat(); // Save current before switching
+
+ try {
+ conversationHistory = JSON.parse(storedChat);
+ currentChatId = chatId;
+ renderChatMessages(conversationHistory);
+ updateCitationsDisplay();
+ setActiveHistoryItem(chatId);
+ } catch (e) {
+ console.error("Error loading chat:", chatId, e);
+ alert("Failed to load chat data.");
+ conversationHistory = [];
+ renderChatMessages(conversationHistory);
+ updateCitationsDisplay();
+ }
+ }
+
+ function saveCurrentChat() {
+ if (currentChatId && conversationHistory.length > 0) {
+ try {
+ localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
+ console.log(`Chat ${currentChatId} saved.`);
+
+ // Update title in index (e.g., use first user message)
+ let index = loadChatIndex();
+ const currentItem = index.find((item) => item.id === currentChatId);
+ if (
+ currentItem &&
+ conversationHistory[0]?.sender === "user" &&
+ !currentItem.title.startsWith("Chat about:")
+ ) {
+ currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
+ saveChatIndex(index);
+ // Re-render history list since the title changed (could be optimized to update only this item)
+ renderHistoryList(index);
+ setActiveHistoryItem(currentChatId); // Re-set active after re-render
+ }
+ } catch (e) {
+ console.error("Error saving chat:", currentChatId, e);
+ // Handle potential storage full errors
+ if (e.name === "QuotaExceededError") {
+ alert("Local storage is full. Cannot save chat history.");
+ // Consider implementing history pruning logic here
+ }
+ }
+ } else if (currentChatId) {
+ // Save empty state for newly created chats if needed, or remove?
+ localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
+ }
+ }
+
+ function loadChatIndex() {
+ try {
+ const storedIndex = localStorage.getItem(CHAT_INDEX_KEY);
+ return storedIndex ? JSON.parse(storedIndex) : [];
+ } catch (e) {
+ console.error("Error loading chat index:", e);
+ return []; // Return empty array on error
+ }
+ }
+
+ function saveChatIndex(indexArray) {
+ try {
+ localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray));
+ } catch (e) {
+ console.error("Error saving chat index:", e);
+ }
+ }
+
+ function renderHistoryList(indexArray) {
+ historyList.innerHTML = ""; // Clear existing
+ if (!indexArray || indexArray.length === 0) {
+ historyList.innerHTML = '<li>No past chats found.</li>';
+ return;
+ }
+ indexArray.forEach((item) => {
+ const li = document.createElement("li");
+ li.dataset.chatId = item.id; // Add ID to li for easier selection
+
+ const a = document.createElement("a");
+ a.href = "#";
+ a.dataset.chatId = item.id;
+ a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
+ a.title = a.textContent; // Tooltip for potentially long titles
+ a.addEventListener("click", (e) => {
+ e.preventDefault();
+ loadChat(item.id);
+ });
+
+ // === Add Delete Button ===
+ const deleteBtn = document.createElement("button");
+ deleteBtn.className = "delete-chat-btn";
+ deleteBtn.innerHTML = "✕"; // Cross icon (could swap in an SVG or FontAwesome trash icon)
+ deleteBtn.title = "Delete Chat";
+ deleteBtn.dataset.chatId = item.id; // Store ID on button too
+ deleteBtn.addEventListener("click", handleDeleteChat);
+
+ li.appendChild(a);
+ li.appendChild(deleteBtn); // Append button to the list item
+ historyList.appendChild(li);
+ });
+ }
+
+ function renderChatMessages(messages) {
+ chatMessages.innerHTML = ""; // Clear existing messages
+ messages.forEach((message) => {
+ // Ensure highlighting is applied when loading from history
+ addMessageToChat(message, false);
+ });
+ if (messages.length === 0) {
+ // chatMessages.innerHTML =
+ //     '<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>';
+ chatMessages.innerHTML =
+     '<div class="message ai-message welcome-message">We will launch this feature very soon.</div>';
+ }
+ // Scroll to bottom after loading messages
+ scrollToBottom();
+ }
+
+ function setActiveHistoryItem(chatId) {
+ document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
+ // Select the LI element directly now
+ const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
+ if (activeLi) {
+ activeLi.classList.add("active");
+ }
+ }
+
+ function loadInitialChat() {
+ const index = loadChatIndex();
+ if (index.length > 0) {
+ loadChat(index[0].id);
+ } else {
+ // Check if handleNewChat wasn't already called by query handler
+ if (!currentChatId) {
+ handleNewChat();
+ }
+ }
+ }
+
+ function loadChatHistoryIndex() {
+ const index = loadChatIndex();
+ renderHistoryList(index);
+ if (currentChatId) setActiveHistoryItem(currentChatId);
+ }
+
+ // === NEW Function to Handle Delete Click ===
+ function handleDeleteChat(event) {
+ event.stopPropagation(); // Prevent triggering loadChat on the link behind it
+ const button = event.currentTarget;
+ const chatIdToDelete = button.dataset.chatId;
+
+ if (!chatIdToDelete) return;
+
+ // Confirmation dialog
+ if (
+ window.confirm(
+ `Are you sure you want to delete this chat session?\n"${
+ button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
+ }"`
+ )
+ ) {
+ console.log(`Deleting chat: ${chatIdToDelete}`);
+
+ // Perform deletion
+ const updatedIndex = deleteChatData(chatIdToDelete);
+
+ // If the deleted chat was the currently active one, load another chat
+ if (currentChatId === chatIdToDelete) {
+ currentChatId = null; // Reset current ID
+ conversationHistory = []; // Clear state
+ if (updatedIndex.length > 0) {
+ // Load the new top chat (most recent remaining)
+ loadChat(updatedIndex[0].id);
+ } else {
+ // No chats left, start a new one
+ handleNewChat();
+ }
+ } else {
+ // If a different chat was deleted, just re-render the list
+ renderHistoryList(updatedIndex);
+ // Re-apply active state in case IDs shifted (though they shouldn't)
+ setActiveHistoryItem(currentChatId);
+ }
+ }
+ }
+
+ // === NEW Function to Delete Chat Data ===
+ function deleteChatData(chatId) {
+ // Remove chat data
+ localStorage.removeItem(CHAT_PREFIX + chatId);
+
+ // Update index
+ let index = loadChatIndex();
+ index = index.filter((item) => item.id !== chatId);
+ saveChatIndex(index);
+
+ console.log(`Chat ${chatId} data and index entry removed.`);
+ return index; // Return the updated index
+ }
+
+ // --- Virtual Scrolling Placeholder ---
+ // NOTE: Virtual scrolling is complex. For now, we do direct rendering.
+ // If performance becomes an issue with very long chats/history,
+ // investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
+ // You would replace parts of `renderChatMessages` and `renderHistoryList`
+ // to work with the chosen library's API (providing data and item renderers).
+ console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
+});
diff --git a/docs/md_v2/ask_ai/index.html b/docs/md_v2/ask_ai/index.html
new file mode 100644
index 00000000..ccb7faa4
--- /dev/null
+++ b/docs/md_v2/ask_ai/index.html
@@ -0,0 +1,64 @@
+
+
+
+
+
+ Crawl4AI Assistant
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Welcome to the Crawl4AI Assistant! How can I help you today?
+
+
+
+
+
+
+ Send
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/md_v2/assets/copy_code.js b/docs/md_v2/assets/copy_code.js
new file mode 100644
index 00000000..20e6be4f
--- /dev/null
+++ b/docs/md_v2/assets/copy_code.js
@@ -0,0 +1,62 @@
+// ==== File: docs/assets/copy_code.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ // Target specifically code blocks within the main content area
+ const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
+
+ codeBlocks.forEach((codeElement) => {
+ const preElement = codeElement.parentElement; // The <pre> tag
+
+ // Ensure the <pre> tag can contain a positioned button
+ if (window.getComputedStyle(preElement).position === 'static') {
+ preElement.style.position = 'relative';
+ }
+
+ // Create the button
+ const copyButton = document.createElement('button');
+ copyButton.className = 'copy-code-button';
+ copyButton.type = 'button';
+ copyButton.setAttribute('aria-label', 'Copy code to clipboard');
+ copyButton.title = 'Copy code to clipboard';
+ copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
+
+ // Append the button to the <pre> element
+ preElement.appendChild(copyButton);
+
+ // Add click event listener
+ copyButton.addEventListener('click', () => {
+ copyCodeToClipboard(codeElement, copyButton);
+ });
+ });
+
+ async function copyCodeToClipboard(codeElement, button) {
+ // Use innerText to get the rendered text content, preserving line breaks
+ const textToCopy = codeElement.innerText;
+
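+ // navigator.clipboard.writeText requires a secure context (https or localhost);
+ // if it is unavailable or rejects, we fall through to the catch below.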
+ try {
+ await navigator.clipboard.writeText(textToCopy);
+
+ // Visual feedback
+ button.innerHTML = 'Copied!';
+ button.classList.add('copied');
+ button.disabled = true; // Temporarily disable
+
+ // Revert button state after a short delay
+ setTimeout(() => {
+ button.innerHTML = 'Copy';
+ button.classList.remove('copied');
+ button.disabled = false;
+ }, 2000); // Show "Copied!" for 2 seconds
+
+ } catch (err) {
+ console.error('Failed to copy code: ', err);
+ // Optional: Provide error feedback on the button
+ button.innerHTML = 'Error';
+ setTimeout(() => {
+ button.innerHTML = 'Copy';
+ }, 2000);
+ }
+ }
+
+ console.log("Copy Code Button script loaded.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/floating_ask_ai_button.js b/docs/md_v2/assets/floating_ask_ai_button.js
new file mode 100644
index 00000000..177c2356
--- /dev/null
+++ b/docs/md_v2/assets/floating_ask_ai_button.js
@@ -0,0 +1,39 @@
+// ==== File: docs/assets/floating_ask_ai_button.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
+ const currentPath = window.location.pathname;
+
+ // Determine the base URL for constructing the link correctly,
+ // especially if deployed in a sub-directory.
+ // This assumes a simple structure; adjust if needed.
+ const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
+
+
+ // Check if the current page IS the Ask AI page
+ // Use includes() for flexibility (handles trailing slash or .html)
+ if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
+ console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
+ return; // Don't add the button on the target page
+ }
+
+ // --- Create the button ---
+ const fabLink = document.createElement('a');
+ fabLink.className = 'floating-ask-ai-button';
+ fabLink.href = askAiPagePath; // Construct the correct URL
+ fabLink.title = 'Ask Crawl4AI Assistant';
+ fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
+
+ // Add content (using SVG icon for better visuals)
+ fabLink.innerHTML = `
+
+
+
+ Ask AI
+ `;
+
+ // Append to body
+ document.body.appendChild(fabLink);
+
+ console.log("Floating Ask AI Button added.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/github_stats.js b/docs/md_v2/assets/github_stats.js
new file mode 100644
index 00000000..a48b3de1
--- /dev/null
+++ b/docs/md_v2/assets/github_stats.js
@@ -0,0 +1,119 @@
+// ==== File: assets/github_stats.js ====
+
+document.addEventListener('DOMContentLoaded', async () => {
+ // --- Configuration ---
+ const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
+ const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav)
+ // Or set to null to append at the end of the header.
+
+ // --- Find elements ---
+ const headerContainer = document.querySelector(targetHeaderSelector);
+ if (!headerContainer) {
+ console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
+ return;
+ }
+
+ const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link
+ let repoUrl = 'https://github.com/unclecode/crawl4ai';
+ // if (repoLinkElement) {
+ // repoUrl = repoLinkElement.href;
+ // } else {
+ // // Fallback: Try finding from config (requires template injection - harder)
+ // // Or hardcode if necessary, but reading from the link is better.
+ // console.warn('GitHub Stats: GitHub repo link not found in header.');
+ // // Try to get repo_url from mkdocs config if available globally (less likely)
+ // // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable
+ // // if (!repoUrl) return; // Exit if still no URL
+ // return; // Exit for now if link isn't found
+ // }
+
+
+ // --- Extract Repo Owner/Name ---
+ let owner = '';
+ let repo = '';
+ try {
+ const url = new URL(repoUrl);
+ const pathParts = url.pathname.split('/').filter(part => part.length > 0);
+ if (pathParts.length >= 2) {
+ owner = pathParts[0];
+ repo = pathParts[1];
+ }
+ } catch (e) {
+ console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
+ return;
+ }
+
+ if (!owner || !repo) {
+ console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
+ return;
+ }
+
+ // --- Get Version (Attempt to extract from site title) ---
+ let version = '';
+ const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
+ // Example title: "Crawl4AI Documentation (v0.5.x)"
+ if (siteTitleElement) {
+ const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
+ if (match && match[1]) {
+ version = match[1].trim();
+ }
+ }
+ if (!version) {
+ console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
+ // You could fallback to config.extra.version if injected into JS
+ // version = window.mkdocs_config?.extra?.version || 'N/A';
+ }
+
+
+ // --- Fetch GitHub API Data ---
+ let stars = '...';
+ let forks = '...';
+ try {
+ const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
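+ // Unauthenticated GitHub API requests are rate-limited (60/hour per IP), hence
+ // the N/A fallback and the "Rate limit exceeded?" warning below.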
+ const response = await fetch(apiUrl);
+
+ if (response.ok) {
+ const data = await response.json();
+ // Format large numbers (optional)
+ stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
+ forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
+ } else {
+ console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
+ stars = 'N/A';
+ forks = 'N/A';
+ }
+ } catch (error) {
+ console.error('GitHub Stats: Error fetching repository data:', error);
+ stars = 'N/A';
+ forks = 'N/A';
+ }
+
+ // --- Create Badge HTML ---
+ const badgeContainer = document.createElement('div');
+ badgeContainer.className = 'github-stats-badge';
+
+ // Use innerHTML for simplicity, including potential icons (requires FontAwesome or similar)
+ // Ensure your theme loads FontAwesome or add it yourself if you want icons.
+ badgeContainer.innerHTML = `
+
+
+
+ ${owner}/${repo}
+ ${version ? ` ${version} ` : ''}
+ ${stars}
+ ${forks}
+
+ `;
+
+ // --- Inject Badge into Header ---
+ const insertBeforeElement = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
+ if (insertBeforeElement) {
+ // headerContainer.insertBefore(badgeContainer, insertBeforeElement);
+ insertBeforeElement.appendChild(badgeContainer); // Append the badge inside the nav element
+ } else {
+ headerContainer.appendChild(badgeContainer);
+ }
+
+ console.info('GitHub Stats: Badge added to header.');
+
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css
new file mode 100644
index 00000000..044c272b
--- /dev/null
+++ b/docs/md_v2/assets/layout.css
@@ -0,0 +1,576 @@
+/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
+
+:root {
+ --header-height: 55px; /* Adjust if needed */
+ --sidebar-width: 280px; /* Adjust if needed */
+ --toc-width: 340px; /* As specified */
+ --content-max-width: 90em; /* Max width for the centered content */
+ --layout-transition-speed: 0.2s;
+ --global-space: 10px;
+}
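+
+/* Layout overview: the header is fixed and full-width; the left sidebar is fixed
+   and --sidebar-width wide; .terminal-mkdocs-main-grid is a centered flex row capped
+   at --content-max-width and offset by margin-left: var(--sidebar-width), containing
+   the main content plus a sticky right ToC. */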
+
+/* --- Basic Setup --- */
+html {
+ scroll-behavior: smooth;
+ scroll-padding-top: calc(var(--header-height) + 15px);
+ box-sizing: border-box;
+}
+*, *:before, *:after {
+ box-sizing: inherit;
+}
+
+body {
+ padding-top: 0;
+ padding-bottom: 0;
+ background-color: var(--background-color);
+ color: var(--font-color);
+ /* Prevents horizontal scrollbars during transitions */
+ overflow-x: hidden;
+}
+
+/* --- Fixed Header --- */
+/* Full width, fixed header */
+.terminal .container:first-child { /* Assuming this targets the header container */
+ position: fixed;
+ top: 0;
+ left: 0;
+ right: 0;
+ height: var(--header-height);
+ background-color: var(--background-color);
+ z-index: 1000;
+ border-bottom: 1px solid var(--progress-bar-background);
+ max-width: none; /* Override any container max-width */
+ padding: 0 calc(var(--global-space) * 2);
+}
+
+/* --- Main Layout Container (Below Header) --- */
+/* This container just provides space for the fixed header */
+.container:has(.terminal-mkdocs-main-grid) {
+ margin: 0 auto;
+ padding: 0;
+ padding-top: var(--header-height); /* Space for fixed header */
+}
+
+/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
+/* THIS is the main centered block */
+.terminal-mkdocs-main-grid {
+ display: flex;
+ align-items: flex-start;
+ /* Enforce max-width and center */
+ max-width: var(--content-max-width);
+ margin-left: auto;
+ margin-right: auto;
+ position: relative;
+ /* Apply side padding within the centered block */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ /* Add margin-left to clear the fixed sidebar - ONLY ON DESKTOP */
+ margin-left: var(--sidebar-width);
+}
+
+/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
+#terminal-mkdocs-side-panel {
+ position: fixed;
+ top: var(--header-height);
+ left: max(0px, calc((90vw - var(--content-max-width)) / 2));
+ bottom: 0;
+ width: var(--sidebar-width);
+ background-color: var(--background-color);
+ border-right: 1px solid var(--progress-bar-background);
+ overflow-y: auto;
+ z-index: 900;
+ padding: 1em calc(var(--global-space) * 2);
+ padding-bottom: 2em;
+ transition: left var(--layout-transition-speed) ease-in-out;
+}
+
+/* --- 2. Main Content Area (Within Centered Grid) --- */
+#terminal-mkdocs-main-content {
+ flex-grow: 1;
+ flex-shrink: 1;
+ min-width: 0; /* Flexbox shrink fix */
+
+ /* No left/right margins needed here - handled by parent grid */
+ margin-left: 0;
+ margin-right: 0;
+
+ /* Internal Padding */
+ padding: 1.5em 2em;
+
+ position: relative;
+ z-index: 1;
+}
+
+/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
+#toc-sidebar {
+ flex-basis: var(--toc-width);
+ flex-shrink: 0;
+ width: var(--toc-width);
+
+ position: sticky; /* Sticks within the centered grid */
+ top: var(--header-height);
+ align-self: stretch;
+ height: calc(100vh - var(--header-height));
+ overflow-y: auto;
+
+ padding: 1.5em 1em;
+ font-size: 0.85em;
+ border-left: 1px solid var(--progress-bar-background);
+ z-index: 800;
+ /* display: none; (toggled by JS) */
+}
+
+/* (ToC link styles remain the same) */
+#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
+#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
+#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
+#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
+#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
+#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
+#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
+
+
+/* --- Footer Styling (Respects Centered Layout) --- */
+footer {
+ background-color: var(--code-bg-color);
+ color: var(--secondary-color);
+ position: relative;
+ z-index: 10;
+ margin-top: 2em;
+
+ /* Apply margin-left to clear the fixed sidebar */
+ margin-left: var(--sidebar-width);
+
+ /* Constrain width relative to the centered grid it follows */
+ max-width: calc(var(--content-max-width) - var(--sidebar-width));
+ margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
+
+ /* Use padding consistent with the grid */
+ padding: 2em calc(var(--global-space) * 2);
+}
+
+/* Adjust footer grid if needed */
+.terminal-mkdocs-footer-grid {
+ display: grid;
+ grid-template-columns: 1fr auto;
+ gap: 1em;
+ align-items: center;
+}
+
+/* ==========================================================================
+ RESPONSIVENESS (Adapting the Non-Fluid Layout)
+ ========================================================================== */
+
+/* --- Medium screens: Hide ToC --- */
+@media screen and (max-width: 1200px) {
+ #toc-sidebar {
+ display: none;
+ }
+
+ .terminal-mkdocs-main-grid {
+ /* Grid adjusts automatically as ToC is removed */
+ /* Ensure grid padding remains */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ }
+
+ #terminal-mkdocs-main-content {
+ /* Content area naturally expands */
+ }
+
+ footer {
+ /* Footer still respects the left sidebar and overall max width */
+ margin-left: var(--sidebar-width);
+ max-width: calc(var(--content-max-width) - var(--sidebar-width));
+ /* Padding remains consistent */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ }
+}
+
+/* --- Mobile Menu Styles --- */
+.mobile-menu-toggle {
+ display: none; /* Hidden by default, shown in mobile */
+ background: none;
+ border: none;
+ padding: 10px;
+ cursor: pointer;
+ z-index: 1200;
+ margin-right: 10px;
+ position: absolute;
+ left: 10px;
+ top: 50%;
+ transform: translateY(-50%);
+ /* Make sure it doesn't get moved */
+ min-width: 30px;
+ min-height: 30px;
+}
+
+.hamburger-line {
+ display: block;
+ width: 22px;
+ height: 2px;
+ margin: 5px 0;
+ background-color: var(--font-color);
+ transition: transform 0.3s, opacity 0.3s;
+}
+
+/* Hamburger animation */
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(1) {
+ transform: translateY(7px) rotate(45deg);
+}
+
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(2) {
+ opacity: 0;
+}
+
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(3) {
+ transform: translateY(-7px) rotate(-45deg);
+}
+
+.mobile-menu-close {
+ display: none; /* Hidden by default, shown in mobile */
+ position: absolute;
+ top: 10px;
+ right: 10px;
+ background: none;
+ border: none;
+ color: var(--font-color);
+ font-size: 24px;
+ cursor: pointer;
+ z-index: 1200;
+ padding: 5px 10px;
+}
+
+.mobile-menu-backdrop {
+ position: fixed;
+ top: 0;
+ left: 0;
+ right: 0;
+ bottom: 0;
+ background-color: rgba(0, 0, 0, 0.7);
+ z-index: 1050;
+}
+
+/* --- Small screens: Hide left sidebar, full width content & footer --- */
+@media screen and (max-width: 768px) {
+ /* Hide the terminal-menu from theme */
+ .terminal-menu {
+ display: none !important;
+ }
+
+ /* Add padding to site name to prevent hamburger overlap */
+ .terminal-mkdocs-site-name,
+ .terminal-logo a,
+ .terminal-nav .logo {
+ padding-left: 40px !important;
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ }
+
+ /* Show mobile menu toggle button */
+ .mobile-menu-toggle {
+ display: block;
+ }
+
+ /* Show mobile menu close button */
+ .mobile-menu-close {
+ display: block;
+ }
+
+ #terminal-mkdocs-side-panel {
+ left: -100%; /* Hide completely off-screen */
+ z-index: 1100;
+ box-shadow: 2px 0 10px rgba(0,0,0,0.3);
+ top: 0; /* Start from top edge */
+ height: 100%; /* Full height */
+ transition: left 0.3s ease-in-out;
+ padding-top: 50px; /* Space for close button */
+ overflow-y: auto;
+ width: 85%; /* Wider on mobile */
+ max-width: 320px; /* Maximum width */
+ background-color: var(--background-color); /* Ensure solid background */
+ }
+
+ #terminal-mkdocs-side-panel.sidebar-visible {
+ left: 0;
+ }
+
+ /* Make navigation links more touch-friendly */
+ #terminal-mkdocs-side-panel a {
+ padding: 6px 15px;
+ display: block;
+ /* No border as requested */
+ }
+
+ #terminal-mkdocs-side-panel ul {
+ padding-left: 0;
+ }
+
+ #terminal-mkdocs-side-panel ul ul a {
+ padding-left: 10px;
+ }
+
+ .terminal-mkdocs-main-grid {
+ /* Grid now takes full width (minus body padding) */
+ margin-left: 0 !important; /* Override sidebar margin with !important */
+ margin-right: 0; /* Override auto margin */
+ max-width: 100%; /* Allow full width */
+ padding-left: var(--global-space); /* Reduce padding */
+ padding-right: var(--global-space);
+ }
+
+ #terminal-mkdocs-main-content {
+ padding: 1.5em 1em; /* Adjust internal padding */
+ }
+
+ footer {
+ margin-left: 0; /* Full width footer */
+ max-width: 100%; /* Allow full width */
+ padding: 2em 1em; /* Adjust internal padding */
+ }
+
+ .terminal-mkdocs-footer-grid {
+ grid-template-columns: 1fr; /* Stack footer items */
+ text-align: center;
+ gap: 0.5em;
+ }
+}
+
+
+/* ==== GitHub Stats Badge Styling ==== */
+
+.github-stats-badge {
+ display: inline-block; /* Or flex if needed */
+ margin-left: 2em; /* Adjust spacing */
+ vertical-align: middle; /* Align with other header items */
+ font-size: 0.9em; /* Slightly smaller font */
+}
+
+.github-stats-badge a {
+ color: var(--secondary-color); /* Use secondary color */
+ text-decoration: none;
+ display: flex; /* Use flex for alignment */
+ align-items: center;
+ gap: 0.8em; /* Space between items */
+ padding: 0.2em 0.5em;
+ border: 1px solid var(--progress-bar-background); /* Subtle border */
+ border-radius: 4px;
+ transition: color 0.2s, background-color 0.2s;
+}
+
+.github-stats-badge a:hover {
+ color: var(--font-color); /* Brighter color on hover */
+ background-color: var(--progress-bar-background); /* Subtle background on hover */
+}
+
+.github-stats-badge .repo-name {
+ color: var(--font-color); /* Make repo name stand out slightly */
+ font-weight: 500; /* Optional bolder weight */
+}
+
+.github-stats-badge .stat {
+ /* Styles for individual stats (version, stars, forks) */
+ white-space: nowrap; /* Prevent wrapping */
+}
+
+.github-stats-badge .stat i {
+ /* Optional: Style for FontAwesome icons */
+ margin-right: 0.3em;
+ color: var(--secondary-dimmed-color); /* Dimmer color for icons */
+}
+
+
+/* Adjust positioning relative to search/nav if needed */
+/* Example: If search is floated right */
+/* .terminal-nav { float: left; } */
+/* .github-stats-badge { float: left; } */
+/* #mkdocs-search-query { float: right; } */
+
+/* --- Responsive adjustments --- */
+@media screen and (max-width: 900px) { /* Example breakpoint */
+ .github-stats-badge .repo-name {
+ display: none; /* Hide full repo name on smaller screens */
+ }
+ .github-stats-badge {
+ margin-left: 1em;
+ }
+ .github-stats-badge a {
+ gap: 0.5em;
+ }
+}
+@media screen and (max-width: 768px) {
+ /* Further hide or simplify on mobile if needed */
+ .github-stats-badge {
+ display: none; /* Example: Hide completely on smallest screens */
+ }
+}
+
+/* --- Ask AI Selection Button --- */
+.ask-ai-selection-button {
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border: none;
+ padding: 6px 10px;
+ font-size: 0.8em;
+ border-radius: 4px;
+ cursor: pointer;
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
+ transition: background-color 0.2s ease, transform 0.15s ease;
+ white-space: nowrap;
+ display: flex;
+ align-items: center;
+ font-weight: 500;
+ animation: askAiButtonAppear 0.2s ease-out;
+}
+
+@keyframes askAiButtonAppear {
+ from {
+ opacity: 0;
+ transform: scale(0.9);
+ }
+ to {
+ opacity: 1;
+ transform: scale(1);
+ }
+}
+
+.ask-ai-selection-button:hover {
+ background-color: var(--primary-color, #50ffff);
+ transform: scale(1.05);
+}
+
+/* Mobile styles for Ask AI button */
+@media screen and (max-width: 768px) {
+ .ask-ai-selection-button {
+ padding: 8px 12px; /* Larger touch target on mobile */
+ font-size: 0.9em; /* Slightly larger text */
+ }
+}
+
+/* ==== File: docs/assets/layout.css (Additions) ==== */
+
+/* ... (keep all existing layout CSS) ... */
+
+/* --- Copy Code Button Styling --- */
+
+/* Ensure the parent can contain the absolutely positioned button */
+#terminal-mkdocs-main-content pre {
+ position: relative; /* Needed for absolute positioning of child */
+ /* Add a little padding top/right to make space for the button */
+ padding-top: 2.5em;
+ padding-right: 1em; /* Ensure padding is sufficient */
+}
+
+.copy-code-button {
+ position: absolute;
+ top: 0.5em; /* Adjust spacing from top */
+ left: 0.5em; /* Adjust spacing from left */
+ z-index: 1; /* Sit on top of code */
+
+ background-color: var(--progress-bar-background, #444); /* Use a background */
+ color: var(--font-color, #eaeaea);
+ border: 1px solid var(--secondary-color, #727578);
+ padding: 3px 8px;
+ font-size: 0.8em;
+ font-family: var(--font-stack, monospace);
+ border-radius: 4px;
+ cursor: pointer;
+ opacity: 0; /* Hidden by default */
+ transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
+ white-space: nowrap;
+}
+
+/* Show button on hover of the container */
+#terminal-mkdocs-main-content pre:hover .copy-code-button {
+ opacity: 0.8; /* Show partially */
+}
+
+.copy-code-button:hover {
+ opacity: 1; /* Fully visible on button hover */
+ background-color: var(--secondary-color, #727578);
+}
+
+.copy-code-button:focus {
+ opacity: 1; /* Ensure visible when focused */
+ outline: 1px dashed var(--primary-color);
+}
+
+
+/* Style for "Copied!" state */
+.copy-code-button.copied {
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border-color: var(--primary-dimmed-color, #09b5a5);
+ opacity: 1; /* Ensure visible */
+}
+.copy-code-button.copied:hover {
+ background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
+}
+
+/* ==== File: docs/assets/layout.css (Additions) ==== */
+
+/* ... (keep all existing layout CSS) ... */
+
+/* --- Floating Ask AI Button --- */
+.floating-ask-ai-button {
+ position: fixed;
+ bottom: 25px;
+ right: 25px;
+ z-index: 1050; /* Below modals, above most content */
+
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border: none;
+ border-radius: 50%; /* Make it circular */
+ width: 60px; /* Adjust size */
+ height: 60px; /* Adjust size */
+ padding: 10px; /* Adjust padding */
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
+ cursor: pointer;
+ transition: background-color 0.2s ease, transform 0.2s ease;
+
+ display: flex;
+ flex-direction: column; /* Stack icon and text */
+ align-items: center;
+ justify-content: center;
+ text-decoration: none;
+ text-align: center;
+}
+
+.floating-ask-ai-button svg {
+ width: 24px; /* Control icon size */
+ height: 24px;
+}
+
+.floating-ask-ai-button span {
+ font-size: 0.7em;
+ margin-top: 2px; /* Space between icon and text */
+ display: block; /* Ensure it takes space */
+ line-height: 1;
+}
+
+
+.floating-ask-ai-button:hover {
+ background-color: var(--primary-color, #50ffff);
+ transform: scale(1.05); /* Slight grow effect */
+}
+
+.floating-ask-ai-button:focus {
+ outline: 2px solid var(--primary-color);
+ outline-offset: 2px;
+}
+
+/* Optional: Hide text on smaller screens if needed */
+@media screen and (max-width: 768px) {
+ .floating-ask-ai-button span {
+ /* display: none; */ /* Uncomment to hide text */
+ }
+ .floating-ask-ai-button {
+ width: 55px;
+ height: 55px;
+ bottom: 20px;
+ right: 20px;
+ }
+}
\ No newline at end of file
diff --git a/docs/md_v2/assets/mobile_menu.js b/docs/md_v2/assets/mobile_menu.js
new file mode 100644
index 00000000..e529839e
--- /dev/null
+++ b/docs/md_v2/assets/mobile_menu.js
@@ -0,0 +1,106 @@
+// mobile_menu.js - Hamburger menu for mobile view
+document.addEventListener('DOMContentLoaded', () => {
+ // Get references to key elements
+ const sidePanel = document.getElementById('terminal-mkdocs-side-panel');
+ const mainHeader = document.querySelector('.terminal .container:first-child');
+
+ if (!sidePanel || !mainHeader) {
+ console.warn('Mobile menu: Required elements not found');
+ return;
+ }
+
+ // Force hide sidebar on mobile
+ const checkMobile = () => {
+ if (window.innerWidth <= 768) {
+ // Force with !important-like priority
+ sidePanel.style.setProperty('left', '-100%', 'important');
+ // Also hide terminal-menu from the theme
+ const terminalMenu = document.querySelector('.terminal-menu');
+ if (terminalMenu) {
+ terminalMenu.style.setProperty('display', 'none', 'important');
+ }
+ } else {
+ sidePanel.style.removeProperty('left');
+ // Restore terminal-menu if it exists
+ const terminalMenu = document.querySelector('.terminal-menu');
+ if (terminalMenu) {
+ terminalMenu.style.removeProperty('display');
+ }
+ }
+ };
+
+ // Run on initial load
+ checkMobile();
+
+ // Also run on resize
+ window.addEventListener('resize', checkMobile);
+
+ // Create hamburger button
+ const hamburgerBtn = document.createElement('button');
+ hamburgerBtn.className = 'mobile-menu-toggle';
+ hamburgerBtn.setAttribute('aria-label', 'Toggle navigation menu');
+ hamburgerBtn.innerHTML = `
+     <span class="hamburger-line"></span>
+     <span class="hamburger-line"></span>
+     <span class="hamburger-line"></span>
+ `;
+
+ // Create backdrop overlay
+ const menuBackdrop = document.createElement('div');
+ menuBackdrop.className = 'mobile-menu-backdrop';
+ menuBackdrop.style.display = 'none';
+ document.body.appendChild(menuBackdrop);
+
+ // Make sure it's properly hidden on page load
+ if (window.innerWidth <= 768) {
+ menuBackdrop.style.display = 'none';
+ }
+
+ // Insert hamburger button into header
+ mainHeader.insertBefore(hamburgerBtn, mainHeader.firstChild);
+
+ // Add menu close button to side panel
+ const closeBtn = document.createElement('button');
+ closeBtn.className = 'mobile-menu-close';
+ closeBtn.setAttribute('aria-label', 'Close navigation menu');
+ closeBtn.innerHTML = `×`;
+ sidePanel.insertBefore(closeBtn, sidePanel.firstChild);
+
+ // Toggle function
+ function toggleMobileMenu() {
+ const isOpen = sidePanel.classList.toggle('sidebar-visible');
+
+ // Toggle backdrop
+ menuBackdrop.style.display = isOpen ? 'block' : 'none';
+
+ // Toggle aria-expanded
+ hamburgerBtn.setAttribute('aria-expanded', isOpen ? 'true' : 'false');
+
+ // Toggle hamburger animation class
+ hamburgerBtn.classList.toggle('is-active');
+
+ // Force sidebar visibility setting
+ if (isOpen) {
+ sidePanel.style.setProperty('left', '0', 'important');
+ } else {
+ sidePanel.style.setProperty('left', '-100%', 'important');
+ }
+
+ // Prevent body scrolling when menu is open
+ document.body.style.overflow = isOpen ? 'hidden' : '';
+ }
+
+ // Event listeners
+ hamburgerBtn.addEventListener('click', toggleMobileMenu);
+ closeBtn.addEventListener('click', toggleMobileMenu);
+ menuBackdrop.addEventListener('click', toggleMobileMenu);
+
+ // Close menu on window resize to desktop
+ window.addEventListener('resize', () => {
+ if (window.innerWidth > 768 && sidePanel.classList.contains('sidebar-visible')) {
+ toggleMobileMenu();
+ }
+ });
+
+ console.log('Mobile menu initialized');
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/selection_ask_ai.js b/docs/md_v2/assets/selection_ask_ai.js
new file mode 100644
index 00000000..e88ad34e
--- /dev/null
+++ b/docs/md_v2/assets/selection_ask_ai.js
@@ -0,0 +1,186 @@
+// ==== File: docs/assets/selection_ask_ai.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ let askAiButton = null;
+ const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
+
+ function createAskAiButton() {
+ const button = document.createElement('button');
+ button.id = 'ask-ai-selection-btn';
+ button.className = 'ask-ai-selection-button';
+
+ // Add icon and text for better visibility
+ button.innerHTML = `
+
+
+
+ Ask AI
+ `;
+
+ // Common styles
+ button.style.display = 'none'; // Initially hidden
+ button.style.position = 'absolute';
+ button.style.zIndex = '1500'; // Ensure it's on top
+ button.style.boxShadow = '0 3px 8px rgba(0, 0, 0, 0.4)'; // More pronounced shadow
+ button.style.transition = 'transform 0.15s ease, background-color 0.2s ease'; // Smooth hover effect
+
+ // Add transform on hover
+ button.addEventListener('mouseover', () => {
+ button.style.transform = 'scale(1.05)';
+ });
+
+ button.addEventListener('mouseout', () => {
+ button.style.transform = 'scale(1)';
+ });
+
+ document.body.appendChild(button);
+ button.addEventListener('click', handleAskAiClick);
+ return button;
+ }
+
+ function getSafeSelectedText() {
+ const selection = window.getSelection();
+ if (!selection || selection.rangeCount === 0) {
+ return null;
+ }
+ // Avoid selecting text within the button itself if it was somehow selected
+ const container = selection.getRangeAt(0).commonAncestorContainer;
+ if (askAiButton && askAiButton.contains(container)) {
+ return null;
+ }
+
+ const text = selection.toString().trim();
+ return text.length > 0 ? text : null;
+ }
+
+ function positionButton(event) {
+ const selection = window.getSelection();
+ if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
+ hideButton();
+ return;
+ }
+
+ const range = selection.getRangeAt(0);
+ const rect = range.getBoundingClientRect();
+
+ // Get viewport dimensions
+ const viewportWidth = window.innerWidth;
+ const viewportHeight = window.innerHeight;
+
+ // Calculate position based on selection
+ const scrollX = window.scrollX;
+ const scrollY = window.scrollY;
+
+ // Default position (top-right of selection)
+ let buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
+ let buttonLeft = rect.right + scrollX + 5; // 5px to the right
+
+ // Check if we're on mobile (which we define as less than 768px)
+ const isMobile = viewportWidth <= 768;
+
+ if (isMobile) {
+ // On mobile, position centered above selection to avoid edge issues
+ buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 10; // 10px above on mobile
+ buttonLeft = rect.left + scrollX + (rect.width / 2) - (askAiButton.offsetWidth / 2); // Centered
+ } else {
+ // For desktop, ensure the button doesn't go off screen
+ // Check right edge
+ if (buttonLeft + askAiButton.offsetWidth > scrollX + viewportWidth) {
+ buttonLeft = scrollX + viewportWidth - askAiButton.offsetWidth - 10; // 10px from right edge
+ }
+ }
+
+ // Check top edge (for all devices)
+ if (buttonTop < scrollY) {
+ // If would go above viewport, position below selection instead
+ buttonTop = rect.bottom + scrollY + 5; // 5px below
+ }
+
+ askAiButton.style.top = `${buttonTop}px`;
+ askAiButton.style.left = `${buttonLeft}px`;
+ askAiButton.style.display = 'block'; // Show the button
+ }
+
+ function hideButton() {
+ if (askAiButton) {
+ askAiButton.style.display = 'none';
+ }
+ }
+
+ function handleAskAiClick(event) {
+ event.stopPropagation(); // Prevent mousedown from hiding button immediately
+ const selectedText = getSafeSelectedText();
+ if (selectedText) {
+ console.log("Selected Text:", selectedText);
+ // Base64 encode for URL safety (handles special chars, line breaks)
+ // Use encodeURIComponent first for proper Unicode handling before btoa
+ const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
+ const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
+ console.log("Navigating to:", targetUrl);
+ window.location.href = targetUrl; // Navigate to Ask AI page
+ }
+ hideButton(); // Hide after click
+ }
+
+ // --- Event Listeners ---
+
+ // Function to handle selection events (both mouse and touch)
+ function handleSelectionEvent(event) {
+ // Slight delay to ensure selection is registered
+ setTimeout(() => {
+ const selectedText = getSafeSelectedText();
+ if (selectedText) {
+ if (!askAiButton) {
+ askAiButton = createAskAiButton();
+ }
+ // Don't position if the event was ON the button itself
+ if (event.target !== askAiButton) {
+ positionButton(event);
+ }
+ } else {
+ hideButton();
+ }
+ }, 10); // Small delay
+ }
+
+ // Mouse selection events (desktop)
+ document.addEventListener('mouseup', handleSelectionEvent);
+
+ // Touch selection events (mobile)
+ document.addEventListener('touchend', handleSelectionEvent);
+ document.addEventListener('selectionchange', () => {
+ // This helps with mobile selection which can happen without mouseup/touchend
+ setTimeout(() => {
+ const selectedText = getSafeSelectedText();
+ if (selectedText && askAiButton) {
+ positionButton();
+ }
+ }, 300); // Longer delay for selection change
+ });
+
+ // Hide button on various events
+ document.addEventListener('mousedown', (event) => {
+ // Hide if clicking anywhere EXCEPT the button itself
+ if (askAiButton && event.target !== askAiButton) {
+ hideButton();
+ }
+ });
+
+ document.addEventListener('touchstart', (event) => {
+ // Same for touch events, but only hide if not on the button
+ if (askAiButton && event.target !== askAiButton) {
+ hideButton();
+ }
+ });
+
+ document.addEventListener('scroll', hideButton, true); // Capture scroll events
+
+ // Also hide when pressing Escape key
+ document.addEventListener('keydown', (event) => {
+ if (event.key === 'Escape') {
+ hideButton();
+ }
+ });
+
+ console.log("Selection Ask AI script loaded.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css
index 8ee8cbb1..46b90ab0 100644
--- a/docs/md_v2/assets/styles.css
+++ b/docs/md_v2/assets/styles.css
@@ -6,8 +6,8 @@
}
:root {
- --global-font-size: 16px;
- --global-code-font-size: 16px;
+ --global-font-size: 14px;
+ --global-code-font-size: 13px;
--global-line-height: 1.5em;
--global-space: 10px;
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
@@ -50,8 +50,17 @@
--display-h1-decoration: none;
--display-h1-decoration: none;
+
+ --header-height: 65px; /* Adjust based on your actual header height */
+ --sidebar-width: 280px; /* Adjust based on your desired sidebar width */
+ --toc-width: 240px; /* Adjust based on your desired ToC width */
+ --layout-transition-speed: 0.2s; /* For potential future animations */
+
+ --page-width : 100em; /* Adjust based on your design */
}
+
+
/* body {
background-color: var(--background-color);
color: var(--font-color);
@@ -256,4 +265,9 @@ div.badges a {
}
div.badges a > img {
width: auto;
+}
+
+
+table td, table th {
+ border: 1px solid var(--code-bg-color) !important;
}
\ No newline at end of file
diff --git a/docs/md_v2/assets/toc.js b/docs/md_v2/assets/toc.js
new file mode 100644
index 00000000..8dad06b2
--- /dev/null
+++ b/docs/md_v2/assets/toc.js
@@ -0,0 +1,144 @@
+// ==== File: assets/toc.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ const mainContent = document.getElementById('terminal-mkdocs-main-content');
+ const tocContainer = document.getElementById('toc-sidebar');
+ const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container
+
+ if (!mainContent) {
+ console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
+ return;
+ }
+
+ // --- Create ToC container if it doesn't exist ---
+ let tocElement = tocContainer;
+ if (!tocElement) {
+ if (!mainGrid) {
+ console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
+ return;
+ }
+ tocElement = document.createElement('aside');
+ tocElement.id = 'toc-sidebar';
+ tocElement.style.display = 'none'; // Keep hidden initially
+ // Append it as the last child of the flex grid
+ mainGrid.appendChild(tocElement);
+ console.info("TOC Generator: Created '#toc-sidebar' element.");
+ }
+
+ // --- Find Headings (h2, h3, h4 are common for ToC) ---
+ const headings = mainContent.querySelectorAll('h2, h3, h4');
+ if (headings.length === 0) {
+ console.info("TOC Generator: No headings found on this page. ToC not generated.");
+ tocElement.style.display = 'none'; // Ensure it's hidden
+ return;
+ }
+
+ // --- Generate ToC List ---
+ const tocList = document.createElement('ul');
+ const observerTargets = []; // Store headings for IntersectionObserver
+
+ headings.forEach((heading, index) => {
+ // Ensure heading has an ID for linking
+ if (!heading.id) {
+ // Create a simple slug-like ID
+ heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
+ }
+
+ const listItem = document.createElement('li');
+ const link = document.createElement('a');
+
+ link.href = `#${heading.id}`;
+ link.textContent = heading.textContent;
+
+ // Add class for styling based on heading level
+ const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4
+ listItem.classList.add(`toc-level-${level}`);
+
+ listItem.appendChild(link);
+ tocList.appendChild(listItem);
+ observerTargets.push(heading); // Add to observer list
+ });
+
+ // --- Populate and Show ToC ---
+ // Optional: Add a title
+ const tocTitle = document.createElement('h4');
+ tocTitle.textContent = 'On this page'; // Customize title if needed
+
+ tocElement.innerHTML = ''; // Clear previous content if any
+ tocElement.appendChild(tocTitle);
+ tocElement.appendChild(tocList);
+ tocElement.style.display = ''; // Show the ToC container
+
+ console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
+
+ // --- Scroll Spy using Intersection Observer ---
+ const tocLinks = tocElement.querySelectorAll('a');
+ let activeLink = null; // Keep track of the current active link
+
+ const observerOptions = {
+ // Observe changes relative to the viewport, offset by the header height
+ // Negative top margin pushes the intersection trigger point down
+ // Negative bottom margin ensures elements low on the screen can trigger before they exit
+ rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
+ threshold: 0 // Trigger as soon as any part enters/exits the boundary
+ };
+
+ const observerCallback = (entries) => {
+ let topmostVisibleHeading = null;
+
+ entries.forEach(entry => {
+ const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
+ if (!link) return;
+
+ // Check if the heading is intersecting (partially or fully visible within rootMargin)
+ if (entry.isIntersecting) {
+ // Among visible headings, find the one closest to the top edge (within the rootMargin)
+                if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.getBoundingClientRect().top) {
+ topmostVisibleHeading = entry.target;
+ }
+ }
+ });
+
+ // If we found a topmost visible heading, activate its link
+ if (topmostVisibleHeading) {
+ const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
+ if (newActiveLink && newActiveLink !== activeLink) {
+ // Remove active class from previous link
+ if (activeLink) {
+ activeLink.classList.remove('active');
+ activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
+ }
+ // Add active class to the new link
+ newActiveLink.classList.add('active');
+ newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
+ activeLink = newActiveLink;
+
+ // Optional: Scroll the ToC sidebar to keep the active link visible
+ // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
+ }
+ }
+ // If no headings are intersecting (scrolled past the last one?), maybe deactivate all
+ // Or keep the last one active - depends on desired behavior. Current logic keeps last active.
+ };
+
+ const observer = new IntersectionObserver(observerCallback, observerOptions);
+
+ // Observe all target headings
+ observerTargets.forEach(heading => observer.observe(heading));
+
+ // Initial check in case a heading is already in view on load
+ // (Requires slight delay for accurate layout calculation)
+ setTimeout(() => {
+ observerCallback(observer.takeRecords()); // Process initial state
+ }, 100);
+
+    // Move the footer (and the <hr> right before it) to the end of the main content
+    const footer = document.querySelector('footer');
+    if (footer) {
+        const hr = footer.previousElementSibling;
+        if (hr && hr.tagName === 'HR') {
+            mainContent.appendChild(hr);
+        }
+        mainContent.appendChild(footer);
+        console.info("TOC Generator: Footer moved to the end of the main content.");
+    }
+
+});
\ No newline at end of file
diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md
index 8b6ae74f..55532fce 100644
--- a/docs/md_v2/blog/index.md
+++ b/docs/md_v2/blog/index.md
@@ -4,6 +4,93 @@ Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical
## Latest Release
+
+---
+
+### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
+*April 23, 2025*
+
+Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
+
+The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
+
+Other key changes:
+
+* Native support for `result.media["tables"]` to export DataFrames
+* Full network + console logs and MHTML snapshot per crawl
+* Browser pooling and pre-warming for faster cold starts
+* New streaming endpoints via MCP API and Playground
+* Robots.txt support, proxy rotation, and improved session handling
+* Deprecated old markdown names, legacy modules cleaned up
+* Massive repo cleanup: ~36K insertions, ~5K deletions across 121 files
+
+[Read full release notes →](releases/0.6.0.md)
+
+---
+
+
+### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
+
+My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
+
+**Major New Features:**
+
+* **Deep Crawling:** Explore entire websites with configurable strategies (BFS, DFS, Best-First). Define custom filters and URL scoring for targeted crawls.
+* **Memory-Adaptive Dispatcher:** Handle large-scale crawls with ease! Our new dispatcher dynamically adjusts concurrency based on available memory and includes built-in rate limiting.
+* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
+* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
+* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
+* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
+
+**Minor Updates & Improvements:**
+
+* **LXML Scraping Mode:** Faster HTML parsing with `LXMLWebScrapingStrategy`.
+* **Proxy Rotation:** Added `ProxyRotationStrategy` with a `RoundRobinProxyStrategy` implementation.
+* **PDF Processing:** Extract text, images, and metadata from PDF files.
+* **URL Redirection Tracking:** Automatically follows and records redirects.
+* **Robots.txt Compliance:** Optionally respect website crawling rules.
+* **LLM-Powered Schema Generation:** Automatically create extraction schemas using an LLM.
+* **`LLMContentFilter`:** Generate high-quality, focused markdown using an LLM.
+* **Improved Error Handling & Stability:** Numerous bug fixes and performance enhancements.
+* **Enhanced Documentation:** Updated guides and examples.
+
+**Breaking Changes & Migration:**
+
+This release includes several breaking changes to improve the library's structure and consistency. Here's what you need to know:
+
+* **`arun_many()` Behavior:** Now uses the `MemoryAdaptiveDispatcher` by default. The return type depends on the `stream` parameter in `CrawlerRunConfig`. Adjust code that relied on unbounded concurrency.
+* **`max_depth` Location:** Moved to `CrawlerRunConfig` and now controls *crawl depth*.
+* **Deep Crawling Imports:** Import `DeepCrawlStrategy` and related classes from `crawl4ai.deep_crawling`.
+* **`BrowserContext` API:** Updated; the old `get_context` method is deprecated.
+* **Optional Model Fields:** Many data model fields are now optional. Handle potential `None` values.
+* **`ScrapingMode` Enum:** Replaced with strategy pattern (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
+* **`content_filter` Parameter:** Removed from `CrawlerRunConfig`. Use extraction strategies or markdown generators with filters.
+* **Removed Functionality:** The synchronous `WebCrawler`, the old CLI, and docs management tools have been removed.
+* **Docker:** Significant changes to deployment. See the [Docker documentation](../deploy/docker/README.md).
+* **`ssl_certificate.json`:** This file has been removed.
+* **Config**: FastFilterChain has been replaced with FilterChain
+* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
+* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
+
+**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
+
+## License Change
+
+Crawl4AI v0.5.0 updates the license to Apache 2.0 *with a required attribution clause*. This means you are free to use, modify, and distribute Crawl4AI (even commercially), but you *must* clearly attribute the project in any public use or distribution. See the updated `LICENSE` file for the full legal text and specific requirements.
+
+**Get Started:**
+
+* **Installation:** `pip install "crawl4ai[all]"` (or use the Docker image)
+* **Documentation:** [https://docs.crawl4ai.com](https://docs.crawl4ai.com)
+* **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
+
+I'm very excited to see what you build with Crawl4AI v0.5.0!
+
+---
+
### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md)
*December 12, 2024*
diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md
new file mode 100644
index 00000000..30269a29
--- /dev/null
+++ b/docs/md_v2/blog/releases/0.5.0.md
@@ -0,0 +1,524 @@
+# Crawl4AI v0.5.0 Release Notes
+
+**Release Theme: Power, Flexibility, and Scalability**
+
+Crawl4AI v0.5.0 is a major release focused on significantly enhancing the
+library's power, flexibility, and scalability. Key improvements include a new
+**deep crawling** system, a **memory-adaptive dispatcher** for handling
+large-scale crawls, **multiple crawling strategies** (including a fast HTTP-only
+crawler), **Docker** deployment options, and a powerful **command-line interface
+(CLI)**. This release also includes numerous bug fixes, performance
+optimizations, and documentation updates.
+
+**Important Note:** This release contains several **breaking changes**. Please
+review the "Breaking Changes" section carefully and update your code
+accordingly.
+
+## Key Features
+
+### 1. Deep Crawling
+
+Crawl4AI now supports deep crawling, allowing you to explore websites beyond the
+initial URLs. This is controlled by the `deep_crawl_strategy` parameter in
+`CrawlerRunConfig`. Several strategies are available:
+
+- **`BFSDeepCrawlStrategy` (Breadth-First Search):** Explores the website level
+ by level. (Default)
+- **`DFSDeepCrawlStrategy` (Depth-First Search):** Explores each branch as
+ deeply as possible before backtracking.
+- **`BestFirstCrawlingStrategy`:** Uses a scoring function to prioritize which
+ URLs to crawl next.
+
+```python
+import time
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeepCrawlStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import DomainFilter, ContentTypeFilter, FilterChain, URLPatternFilter, KeywordRelevanceScorer, BestFirstCrawlingStrategy
+import asyncio
+
+# Create a filter chain to filter urls based on patterns, domains and content type
+filter_chain = FilterChain(
+ [
+ DomainFilter(
+ allowed_domains=["docs.crawl4ai.com"],
+ blocked_domains=["old.docs.crawl4ai.com"],
+ ),
+ URLPatternFilter(patterns=["*core*", "*advanced*"],),
+ ContentTypeFilter(allowed_types=["text/html"]),
+ ]
+)
+
+# Create a keyword scorer that prioritises the pages with certain keywords first
+keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"], weight=0.7
+)
+
+# Set up the configuration
+deep_crawl_config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ filter_chain=filter_chain,
+ url_scorer=keyword_scorer,
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ stream=True,
+ verbose=True,
+)
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ start_time = time.perf_counter()
+ results = []
+ async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
+ print(f"Crawled: {result.url} (Depth: {result.metadata['depth']}), score: {result.metadata['score']:.2f}")
+ results.append(result)
+ duration = time.perf_counter() - start_time
+ print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
+
+asyncio.run(main())
+```
+
+**Breaking Change:** The `max_depth` parameter is now part of `CrawlerRunConfig`
+and controls the _depth_ of the crawl, not the number of concurrent crawls. The
+`arun()` and `arun_many()` methods are now decorated to handle deep crawling
+strategies. Imports for deep crawling strategies have changed. See the
+[Deep Crawling documentation](../../core/deep-crawling.md) for more details.
+
+### 2. Memory-Adaptive Dispatcher
+
+The new `MemoryAdaptiveDispatcher` dynamically adjusts concurrency based on
+available system memory and includes built-in rate limiting. This prevents
+out-of-memory errors and avoids overwhelming target websites.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MemoryAdaptiveDispatcher
+import asyncio
+
+# Configure the dispatcher (optional, defaults are used if not provided)
+dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
+ check_interval=0.5, # Check memory every 0.5 seconds
+)
+
+async def batch_mode():
+ async with AsyncWebCrawler() as crawler:
+ results = await crawler.arun_many(
+ urls=["https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai"],
+ config=CrawlerRunConfig(stream=False), # Batch mode
+ dispatcher=dispatcher,
+ )
+ for result in results:
+ print(f"Crawled: {result.url} with status code: {result.status_code}")
+
+async def stream_mode():
+ async with AsyncWebCrawler() as crawler:
+ # OR, for streaming:
+ async for result in await crawler.arun_many(
+ urls=["https://docs.crawl4ai.com", "https://github.com/unclecode/crawl4ai"],
+ config=CrawlerRunConfig(stream=True),
+ dispatcher=dispatcher,
+ ):
+ print(f"Crawled: {result.url} with status code: {result.status_code}")
+
+print("Dispatcher in batch mode:")
+asyncio.run(batch_mode())
+print("-" * 50)
+print("Dispatcher in stream mode:")
+asyncio.run(stream_mode())
+```
+
+**Breaking Change:** `AsyncWebCrawler.arun_many()` now uses
+`MemoryAdaptiveDispatcher` by default. Existing code that relied on unbounded
+concurrency may require adjustments.
+
+### 3. Multiple Crawling Strategies (Playwright and HTTP)
+
+Crawl4AI now offers two crawling strategies:
+
+- **`AsyncPlaywrightCrawlerStrategy` (Default):** Uses Playwright for
+ browser-based crawling, supporting JavaScript rendering and complex
+ interactions.
+- **`AsyncHTTPCrawlerStrategy`:** A lightweight, fast, and memory-efficient
+ HTTP-only crawler. Ideal for simple scraping tasks where browser rendering is
+ unnecessary.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+import asyncio
+
+# Use the HTTP crawler strategy
+http_crawler_config = HTTPCrawlerConfig(
+ method="GET",
+ headers={"User-Agent": "MyCustomBot/1.0"},
+ follow_redirects=True,
+ verify_ssl=True
+)
+
+async def main():
+    async with AsyncWebCrawler(crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_crawler_config)) as crawler:
+ result = await crawler.arun("https://example.com")
+ print(f"Status code: {result.status_code}")
+ print(f"Content length: {len(result.html)}")
+
+asyncio.run(main())
+```
+
+### 4. Docker Deployment
+
+Crawl4AI can now be easily deployed as a Docker container, providing a
+consistent and isolated environment. The Docker image includes a FastAPI server
+with both streaming and non-streaming endpoints.
+
+```bash
+# Build the image (from the project root)
+docker build -t crawl4ai .
+
+# Run the container
+docker run -d -p 8000:8000 --name crawl4ai crawl4ai
+```
+
+**API Endpoints:**
+
+- `/crawl` (POST): Non-streaming crawl.
+- `/crawl/stream` (POST): Streaming crawl (NDJSON).
+- `/health` (GET): Health check.
+- `/schema` (GET): Returns configuration schemas.
+- `/md/{url}` (GET): Returns markdown content of the URL.
+- `/llm/{url}` (GET): Returns LLM extracted content.
+- `/token` (POST): Get JWT token
+
+**Breaking Changes:**
+
+- Docker deployment now requires a `.llm.env` file for API keys.
+- Docker deployment now requires Redis and a new `config.yml` structure.
+- Server startup now uses `supervisord` instead of direct process management.
+- Docker server now requires authentication by default (JWT tokens).
+
+See the [Docker deployment documentation](../../core/docker-deployment.md) for
+detailed instructions.
+
+### 5. Command-Line Interface (CLI)
+
+A new CLI (`crwl`) provides convenient access to Crawl4AI's functionality from
+the terminal.
+
+```bash
+# Basic crawl
+crwl https://example.com
+
+# Get markdown output
+crwl https://example.com -o markdown
+
+# Use a configuration file
+crwl https://example.com -B browser.yml -C crawler.yml
+
+# Use LLM-based extraction
+crwl https://example.com -e extract.yml -s schema.json
+
+# Ask a question about the crawled content
+crwl https://example.com -q "What is the main topic?"
+
+# See usage examples
+crwl --example
+```
+
+See the [CLI documentation](../../core/cli.md) for more details.
+
+### 6. LXML Scraping Mode
+
+Added `LXMLWebScrapingStrategy` for faster HTML parsing using the `lxml`
+library. This can significantly improve scraping performance, especially for
+large or complex pages. Set `scraping_strategy=LXMLWebScrapingStrategy()` in
+your `CrawlerRunConfig`.
+
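+A minimal sketch of what this looks like (the target URL is just a placeholder):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+
+async def main():
+    # Swap the default scraper for the faster lxml-based one
+    config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy())
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        print(f"Status: {result.status_code}, HTML length: {len(result.html)}")
+
+asyncio.run(main())
+```
+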
+**Breaking Change:** The `ScrapingMode` enum has been replaced with a strategy
+pattern. Use `WebScrapingStrategy` (default) or `LXMLWebScrapingStrategy`.
+
+### 7. Proxy Rotation
+
+Added `ProxyRotationStrategy` abstract base class with `RoundRobinProxyStrategy`
+concrete implementation.
+
+```python
+import re
+from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ RoundRobinProxyStrategy,
+)
+import asyncio
+from crawl4ai import ProxyConfig
+async def main():
+ # Load proxies and create rotation strategy
+ proxies = ProxyConfig.from_env()
+ #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
+ if not proxies:
+ print("No proxies found in environment. Set PROXIES env variable!")
+ return
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+ # Create configs
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ proxy_rotation_strategy=proxy_strategy
+ )
+
+    print("\n📈 Initializing crawler with proxy rotation...")
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+        print("\n🚀 Starting batch crawl with proxy rotation...")
+        results = await crawler.arun_many(
+            urls=urls,
+            config=run_config
+        )
+        for result in results:
+            if result.success:
+                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                current_proxy = run_config.proxy_config if run_config.proxy_config else None
+
+                if current_proxy and ip_match:
+                    print(f"URL {result.url}")
+                    print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
+                    verified = ip_match.group(0) == current_proxy.ip
+                    if verified:
+                        print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
+                    else:
+                        print("❌ Proxy failed or IP mismatch!")
+                    print("---")
+
+asyncio.run(main())
+```
+
+## Other Changes and Improvements
+
+- **Added: `LLMContentFilter` for intelligent markdown generation.** This new
+ filter uses an LLM to create more focused and relevant markdown output.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import LLMContentFilter
+from crawl4ai import LLMConfig
+import asyncio
+
+llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
+
+markdown_generator = DefaultMarkdownGenerator(
+ content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
+)
+
+config = CrawlerRunConfig(markdown_generator=markdown_generator)
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://docs.crawl4ai.com", config=config)
+ print(result.markdown.fit_markdown)
+
+asyncio.run(main())
+```
+
+- **Added: URL redirection tracking.** The crawler now automatically follows
+ HTTP redirects (301, 302, 307, 308) and records the final URL in the
+ `redirected_url` field of the `CrawlResult` object. No code changes are
+ required to enable this; it's automatic.
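+
+  A quick way to see it in action (sketch; any URL that issues a redirect will do):
+
+  ```python
+  import asyncio
+  from crawl4ai import AsyncWebCrawler
+
+  async def main():
+      async with AsyncWebCrawler() as crawler:
+          result = await crawler.arun("http://github.com/unclecode/crawl4ai")
+          print("url:", result.url)
+          print("redirected_url:", result.redirected_url)
+
+  asyncio.run(main())
+  ```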
+
+- **Added: LLM-powered schema generation utility.** A new `generate_schema`
+ method has been added to `JsonCssExtractionStrategy` and
+ `JsonXPathExtractionStrategy`. This greatly simplifies creating extraction
+ schemas.
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import LLMConfig
+
+llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
+
+schema = JsonCssExtractionStrategy.generate_schema(
+    html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
+    llm_config=llm_config,
+ query="Extract product name and price"
+)
+print(schema)
+```
+
+Expected output (may vary slightly due to the LLM):
+```json
+{
+  "name": "ProductExtractor",
+  "baseSelector": "div.product",
+  "fields": [
+    {"name": "name", "selector": "h2", "type": "text"},
+    {"name": "price", "selector": ".price", "type": "text"}
+  ]
+}
+```
+
+- **Added: robots.txt compliance support.** The crawler can now respect
+ `robots.txt` rules. Enable this by setting `check_robots_txt=True` in
+ `CrawlerRunConfig`.
+
+ ```python
+ config = CrawlerRunConfig(check_robots_txt=True)
+ ```
+
+- **Added: PDF processing capabilities.** Crawl4AI can now extract text, images,
+ and metadata from PDF files (both local and remote). This uses a new
+ `PDFCrawlerStrategy` and `PDFContentScrapingStrategy`.
+
+ ```python
+ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+ from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
+ import asyncio
+
+ async def main():
+ async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
+ result = await crawler.arun(
+ "https://arxiv.org/pdf/2310.06825.pdf",
+ config=CrawlerRunConfig(
+ scraping_strategy=PDFContentScrapingStrategy()
+ )
+ )
+ print(result.markdown) # Access extracted text
+ print(result.metadata) # Access PDF metadata (title, author, etc.)
+
+ asyncio.run(main())
+ ```
+
+- **Added: Support for frozenset serialization.** Improves configuration
+ serialization, especially for sets of allowed/blocked domains. No code changes
+ required.
+
+- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
+ extraction, filtering, and schema generation tasks. It simplifies passing
+ provider strings, API tokens, and base URLs across all sections where LLM
+ configuration is necessary. It also enables reuse and allows for quick
+ experimentation between different LLM configurations.
+
+ ```python
+ from crawl4ai import LLMConfig
+ from crawl4ai.extraction_strategy import LLMExtractionStrategy
+ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+ # Example of using LLMConfig with LLMExtractionStrategy
+ llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
+ strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
+
+ # Example usage within a crawler
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=CrawlerRunConfig(extraction_strategy=strategy)
+ )
+ ```
+ **Breaking Change:** Removed old parameters like `provider`, `api_token`,
+ `base_url`, and `api_base` from `LLMExtractionStrategy` and
+ `LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
+
+- **Changed: Improved browser context management and added shared data support.
+ (Breaking Change:** `BrowserContext` API updated). Browser contexts are now
+ managed more efficiently, reducing resource usage. A new `shared_data`
+ dictionary is available in the `BrowserContext` to allow passing data between
+ different stages of the crawling process. **Breaking Change:** The
+ `BrowserContext` API has changed, and the old `get_context` method is
+ deprecated.
+
+- **Changed:** Renamed `final_url` to `redirected_url` in `CrawledURL`. This
+ improves consistency and clarity. Update any code referencing the old field
+ name.
+
+- **Changed:** Improved type hints and removed unused files. This is an internal
+ improvement and should not require code changes.
+
+- **Changed:** Reorganized deep crawling functionality into dedicated module.
+ (**Breaking Change:** Import paths for `DeepCrawlStrategy` and related classes
+ have changed). This improves code organization. Update imports to use the new
+ `crawl4ai.deep_crawling` module.
+
+- **Changed:** Improved HTML handling and cleanup codebase. (**Breaking
+ Change:** Removed `ssl_certificate.json` file). This removes an unused file.
+ If you were relying on this file for custom certificate validation, you'll
+ need to implement an alternative approach.
+
+- **Changed:** Enhanced serialization and config handling. (**Breaking Change:**
+ `FastFilterChain` has been replaced with `FilterChain`). This change
+ simplifies config and improves the serialization.
+
+- **Added:** Modified the license to Apache 2.0 _with a required attribution
+ clause_. See the `LICENSE` file for details. All users must now clearly
+ attribute the Crawl4AI project when using, distributing, or creating
+ derivative works.
+
+- **Fixed:** Prevent memory leaks by ensuring proper closure of Playwright
+ pages. No code changes required.
+
+- **Fixed:** Make model fields optional with default values (**Breaking
+ Change:** Code relying on all fields being present may need adjustment).
+ Fields in data models (like `CrawledURL`) are now optional, with default
+ values (usually `None`). Update code to handle potential `None` values.
+
+- **Fixed:** Adjust memory threshold and fix dispatcher initialization. This is
+ an internal bug fix; no code changes are required.
+
+- **Fixed:** Ensure proper exit after running doctor command. No code changes
+ are required.
+- **Fixed:** JsonCss selector and crawler improvements.
+- **Fixed:** Long page screenshots that were not working (#403).
+- **Documentation:** Updated documentation URLs to the new domain.
+- **Documentation:** Added SERP API project example.
+- **Documentation:** Added clarifying comments for CSS selector behavior.
+- **Documentation:** Add Code of Conduct for the project (#410)
+
+## Breaking Changes Summary
+
+- **Dispatcher:** The `MemoryAdaptiveDispatcher` is now the default for
+ `arun_many()`, changing concurrency behavior. The return type of `arun_many`
+ depends on the `stream` parameter.
+- **Deep Crawling:** `max_depth` is now part of `CrawlerRunConfig` and controls
+ crawl depth. Import paths for deep crawling strategies have changed.
+- **Browser Context:** The `BrowserContext` API has been updated.
+- **Models:** Many fields in data models are now optional, with default values.
+- **Scraping Mode:** `ScrapingMode` enum replaced by strategy pattern
+ (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
+- **Content Filter:** Removed `content_filter` parameter from
+ `CrawlerRunConfig`. Use extraction strategies or markdown generators with
+ filters instead.
+- **Removed:** Synchronous `WebCrawler`, CLI, and docs management functionality.
+- **Docker:** Significant changes to Docker deployment, including new
+ requirements and configuration.
+- **File Removed**: Removed ssl_certificate.json file which might affect
+ existing certificate validations
+- **Renamed**: final_url to redirected_url for consistency
+- **Config**: FastFilterChain has been replaced with FilterChain
+- **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT,
+ List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+- **Proxy**: Removed synchronous WebCrawler support and related rate limiting
+ configurations
+
+## Migration Guide
+
+1. **Update Imports:** Adjust imports for `DeepCrawlStrategy`,
+ `BreadthFirstSearchStrategy`, and related classes due to the new
+ `deep_crawling` module structure.
+2. **`CrawlerRunConfig`:** Move `max_depth` to `CrawlerRunConfig`. If using
+ `content_filter`, migrate to an extraction strategy or a markdown generator
+ with a filter.
+3. **`arun_many()`:** Adapt code to the new `MemoryAdaptiveDispatcher` behavior
+ and the return type.
+4. **`BrowserContext`:** Update code using the `BrowserContext` API.
+5. **Models:** Handle potential `None` values for optional fields in data
+ models.
+6. **Scraping:** Replace `ScrapingMode` enum with `WebScrapingStrategy` or
+ `LXMLWebScrapingStrategy`.
+7. **Docker:** Review the updated Docker documentation and adjust your
+ deployment accordingly.
+8. **CLI:** Migrate to the new `crwl` command and update any scripts using the
+ old CLI.
+9. **Proxy:** Removed synchronous WebCrawler support and related rate limiting
+   configurations.
+10. **Config:** Replace `FastFilterChain` with `FilterChain`.
diff --git a/docs/md_v2/blog/releases/0.6.0.md b/docs/md_v2/blog/releases/0.6.0.md
new file mode 100644
index 00000000..a3a7c216
--- /dev/null
+++ b/docs/md_v2/blog/releases/0.6.0.md
@@ -0,0 +1,143 @@
+# Crawl4AI v0.6.0 Release Notes
+
+We're excited to announce the release of **Crawl4AI v0.6.0**, our biggest and most feature-rich update yet. This version introduces major architectural upgrades, brand-new capabilities for geo-aware crawling, high-efficiency scraping, and real-time streaming support for scalable deployments.
+
+---
+
+## Highlights
+
+### 1. **World-Aware Crawlers**
+Crawl as if you’re anywhere in the world. With v0.6.0, each crawl can simulate:
+- Specific GPS coordinates
+- Browser locale
+- Timezone
+
+Example:
+```python
+CrawlerRunConfig(
+ url="https://browserleaks.com/geo",
+ locale="en-US",
+ timezone_id="America/Los_Angeles",
+ geolocation=GeolocationConfig(
+ latitude=34.0522,
+ longitude=-118.2437,
+ accuracy=10.0
+ )
+)
+```
+Great for accessing region-specific content or testing global behavior.
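+
+A runnable variant of the same idea (the import of `GeolocationConfig` from the top-level package is assumed; adjust it if your install exposes the class elsewhere):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        locale="en-US",
+        timezone_id="America/Los_Angeles",
+        geolocation=GeolocationConfig(latitude=34.0522, longitude=-118.2437, accuracy=10.0),
+    )
+    async with AsyncWebCrawler() as crawler:
+        # browserleaks.com/geo reflects whatever location the browser reports
+        result = await crawler.arun("https://browserleaks.com/geo", config=config)
+        print(result.success, result.url)
+
+asyncio.run(main())
+```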
+
+---
+
+### 2. **Native Table Extraction**
+Extract HTML tables directly into usable formats like Pandas DataFrames or CSV with zero parsing hassle. All table data is available under `result.media["tables"]`.
+
+Example:
+```python
+raw_df = pd.DataFrame(
+ result.media["tables"][0]["rows"],
+ columns=result.media["tables"][0]["headers"]
+)
+```
+This makes it ideal for scraping financial data, pricing pages, or anything tabular.
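+
+For context, an end-to-end sketch (the URL is only a placeholder; any page with a plain `<table>` works, and each table entry follows the `rows`/`headers` shape used above):
+
+```python
+import asyncio
+import pandas as pd
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com/some-page-with-tables")
+        tables = result.media.get("tables", [])
+        if not tables:
+            print("No tables detected on this page.")
+            return
+        df = pd.DataFrame(tables[0]["rows"], columns=tables[0]["headers"])
+        print(df.head())
+
+asyncio.run(main())
+```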
+
+---
+
+### 3. **Browser Pooling & Pre-Warming**
+We've overhauled browser management. Now, multiple browser instances can be pooled and pages pre-warmed for ultra-fast launches:
+- Reduces cold-start latency
+- Lowers memory spikes
+- Enhances parallel crawling stability
+
+This powers the new **Docker Playground** experience and streamlines heavy-load crawling.
+
+---
+
+### 4. **Traffic & Snapshot Capture**
+Need full visibility? You can now capture:
+- Full network traffic logs
+- Console output
+- MHTML page snapshots for post-crawl audits and debugging
+
+No more guesswork on what happened during your crawl.
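+For the MHTML piece specifically, the switch is `capture_mhtml=True` on `CrawlerRunConfig`, and the snapshot comes back as a string on `result.mhtml` (see the updated config docs in this release). A minimal sketch:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(capture_mhtml=True)  # snapshot the fully rendered page
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        if result.mhtml:
+            with open("snapshot.mhtml", "w", encoding="utf-8") as f:
+                f.write(result.mhtml)
+
+asyncio.run(main())
+```
+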
+
+---
+
+### 5. **MCP API and Streaming Support**
+We’re exposing **MCP socket and SSE endpoints**, allowing:
+- Live streaming of crawl results
+- Real-time integration with agents or frontends
+- A new Playground UI for interactive crawling
+
+This is a major step towards making Crawl4AI real-time ready.
+
+---
+
+### 6. **Stress-Test Framework**
+Want to test performance under heavy load? v0.6.0 includes a new memory stress-test suite that supports 1,000+ URL workloads. Ideal for:
+- Load testing
+- Performance benchmarking
+- Validating memory efficiency
+
+---
+
+## Core Improvements
+- Robots.txt compliance
+- Proxy rotation support
+- Improved URL normalization and session reuse
+- Shared data across crawler hooks
+- New page routing logic
+
+---
+
+## Breaking Changes & Deprecations
+- Legacy `crawl4ai/browser/*` modules are removed. Update imports accordingly.
+- `AsyncPlaywrightCrawlerStrategy.get_page` now uses a new function signature.
+- Deprecated markdown generator aliases now point to `DefaultMarkdownGenerator` with a deprecation warning.
+
+---
+
+## Miscellaneous Updates
+- FastAPI validators replaced custom validation logic
+- Docker build now based on a Chromium layer
+- Repo-wide cleanup: ~36,000 insertions, ~5,000 deletions
+
+---
+
+## New Examples Included
+- Geo-location crawling
+- Network + console log capture
+- Docker MCP API usage
+- Markdown selector usage
+- Crypto project data extraction
+
+---
+
+## Watch the Release Video
+Want a visual walkthrough of all these updates? Watch the video:
+🔗 https://youtu.be/9x7nVcjOZks
+
+If you're new to Crawl4AI, start here:
+🔗 https://www.youtube.com/watch?v=xo3qK6Hg9AA&t=15s
+
+---
+
+## Join the Community
+We’ve just opened up our **Discord** for the public. Join us to:
+- Ask questions
+- Share your projects
+- Get help or contribute
+
+💬 https://discord.gg/wpYFACrHR4
+
+---
+
+## Install or Upgrade
+```bash
+pip install -U crawl4ai
+```
+
+---
+
+Live long and import crawl4ai. 🖖
+
diff --git a/docs/md_v2/core/ask-ai.md b/docs/md_v2/core/ask-ai.md
new file mode 100644
index 00000000..9122bd29
--- /dev/null
+++ b/docs/md_v2/core/ask-ai.md
@@ -0,0 +1,74 @@
+
+
+
+
+
+
+
diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md
index 8d916738..9ea8f2a1 100644
--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -1,9 +1,10 @@
-# Browser & Crawler Configuration (Quick Overview)
+# Browser, Crawler & LLM Configuration (Quick Overview)
-Crawl4AI’s flexibility stems from two key classes:
+Crawl4AI's flexibility stems from three key classes:
-1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
-2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
+1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
+2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
+3. **`LLMConfig`** – Dictates **how** LLM providers are configured (model, API token, base URL, temperature, etc.).
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
@@ -35,18 +36,16 @@ class BrowserConfig:
### Key Fields to Note
-
-
-1. **`browser_type`**
+1. **`browser_type`**
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
- Defaults to `"chromium"`.
- If you need a different engine, specify it here.
-2. **`headless`**
+2. **`headless`**
- `True`: Runs the browser in headless mode (invisible browser).
- `False`: Runs the browser in visible mode, which helps with debugging.
-3. **`proxy_config`**
+3. **`proxy_config`**
- A dictionary with fields like:
```json
{
@@ -57,31 +56,31 @@ class BrowserConfig:
```
- Leave as `None` if a proxy is not required.
-4. **`viewport_width` & `viewport_height`**:
+4. **`viewport_width` & `viewport_height`**:
- The initial window size.
- Some sites behave differently with smaller or bigger viewports.
-5. **`verbose`**:
+5. **`verbose`**:
- If `True`, prints extra logs.
- Handy for debugging.
-6. **`use_persistent_context`**:
+6. **`use_persistent_context`**:
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
- Typically also set `user_data_dir` to point to a folder.
-7. **`cookies`** & **`headers`**:
+7. **`cookies`** & **`headers`**:
- If you want to start with specific cookies or add universal HTTP headers, set them here.
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
-8. **`user_agent`**:
+8. **`user_agent`**:
- Custom User-Agent string. If `None`, a default is used.
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
-9. **`text_mode`** & **`light_mode`**:
+9. **`text_mode`** & **`light_mode`**:
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
-10. **`extra_args`**:
+10. **`extra_args`**:
- Additional flags for the underlying browser.
- E.g. `["--disable-extensions"]`.
@@ -135,6 +134,12 @@ class CrawlerRunConfig:
wait_for=None,
screenshot=False,
pdf=False,
+ capture_mhtml=False,
+ # Location and Identity Parameters
+ locale=None, # e.g. "en-US", "fr-FR"
+ timezone_id=None, # e.g. "America/New_York"
+ geolocation=None, # GeolocationConfig object
+ # Resource Management
enable_rate_limiting=False,
rate_limit_config=None,
memory_threshold_percent=70.0,
@@ -150,62 +155,65 @@ class CrawlerRunConfig:
### Key Fields to Note
-1. **`word_count_threshold`**:
+1. **`word_count_threshold`**:
- The minimum word count before a block is considered.
- If your site has lots of short paragraphs or items, you can lower it.
-2. **`extraction_strategy`**:
+2. **`extraction_strategy`**:
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
-3. **`markdown_generator`**:
+3. **`markdown_generator`**:
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
- If `None`, a default approach is used.
-4. **`cache_mode`**:
+4. **`cache_mode`**:
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
- If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`.
-5. **`js_code`**:
+5. **`js_code`**:
- A string or list of JS strings to execute.
- - Great for “Load More” buttons or user interactions.
+ - Great for "Load More" buttons or user interactions.
-6. **`wait_for`**:
+6. **`wait_for`**:
- A CSS or JS expression to wait for before extracting content.
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
-7. **`screenshot`** & **`pdf`**:
- - If `True`, captures a screenshot or PDF after the page is fully loaded.
- - The results go to `result.screenshot` (base64) or `result.pdf` (bytes).
+7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
+ - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
+ - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
-8. **`verbose`**:
+8. **Location Parameters**:
+ - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
+ - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
+ - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
+ - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control)
+
+9. **`verbose`**:
- Logs additional runtime details.
- - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
+ - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
-9. **`enable_rate_limiting`**:
+10. **`enable_rate_limiting`**:
- If `True`, enables rate limiting for batch processing.
- Requires `rate_limit_config` to be set.
-10. **`rate_limit_config`**:
- - A `RateLimitConfig` object controlling rate limiting behavior.
- - See below for details.
-
-11. **`memory_threshold_percent`**:
+11. **`memory_threshold_percent`**:
- The memory threshold (as a percentage) to monitor.
- If exceeded, the crawler will pause or slow down.
-12. **`check_interval`**:
+12. **`check_interval`**:
- The interval (in seconds) to check system resources.
- Affects how often memory and CPU usage are monitored.
-13. **`max_session_permit`**:
+13. **`max_session_permit`**:
- The maximum number of concurrent crawl sessions.
- Helps prevent overwhelming the system.
-14. **`display_mode`**:
+14. **`display_mode`**:
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
- Affects how much information is printed during the crawl.
+
### Helper Methods
The `clone()` method is particularly useful for creating variations of your crawler configuration:
@@ -236,67 +244,36 @@ The `clone()` method:
- Leaves the original configuration unchanged
- Perfect for creating variations without repeating all parameters
-### Rate Limiting & Resource Management
-
-For batch processing with `arun_many()`, you can enable intelligent rate limiting:
-
-```python
-from crawl4ai import RateLimitConfig
-
-config = CrawlerRunConfig(
- enable_rate_limiting=True,
- rate_limit_config=RateLimitConfig(
- base_delay=(1.0, 3.0), # Random delay range
- max_delay=60.0, # Max delay after rate limits
- max_retries=3, # Retries before giving up
- rate_limit_codes=[429, 503] # Status codes to watch
- ),
- memory_threshold_percent=70.0, # Memory threshold
- check_interval=1.0, # Resource check interval
- max_session_permit=20, # Max concurrent crawls
- display_mode="DETAILED" # Progress display mode
-)
-```
-
-This configuration:
-- Implements intelligent rate limiting per domain
-- Monitors system resources
-- Provides detailed progress information
-- Manages concurrent crawls efficiently
-
-**Minimal Example**:
-
-```python
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-
-crawl_conf = CrawlerRunConfig(
- js_code="document.querySelector('button#loadMore')?.click()",
- wait_for="css:.loaded-content",
- screenshot=True,
- enable_rate_limiting=True,
- rate_limit_config=RateLimitConfig(
- base_delay=(1.0, 3.0),
- max_delay=60.0,
- max_retries=3,
- rate_limit_codes=[429, 503]
- ),
- stream=True # Enable streaming
-)
-
-async with AsyncWebCrawler() as crawler:
- result = await crawler.arun(url="https://example.com", config=crawl_conf)
- print(result.screenshot[:100]) # Base64-encoded PNG snippet
-```
-
---
-## 3. Putting It All Together
-In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` depending on each call’s needs:
+## 3. LLMConfig Essentials
+
+### Key fields to note
+
+1. **`provider`**:
+   - Which LLM provider to use.
+   - Possible values are `"ollama/llama3"`, `"groq/llama3-70b-8192"`, `"groq/llama3-8b-8192"`, `"openai/gpt-4o-mini"`, `"openai/gpt-4o"`, `"openai/o1-mini"`, `"openai/o1-preview"`, `"openai/o3-mini"`, `"openai/o3-mini-high"`, `"anthropic/claude-3-haiku-20240307"`, `"anthropic/claude-3-opus-20240229"`, `"anthropic/claude-3-sonnet-20240229"`, `"anthropic/claude-3-5-sonnet-20240620"`, `"gemini/gemini-pro"`, `"gemini/gemini-1.5-pro"`, `"gemini/gemini-2.0-flash"`, `"gemini/gemini-2.0-flash-exp"`, `"gemini/gemini-2.0-flash-lite-preview-02-05"`, `"deepseek/deepseek-chat"` *(default: `"openai/gpt-4o-mini"`)*
+
+2. **`api_token`**:
+   - Optional. If not provided explicitly, `api_token` is read from an environment variable based on the provider. For example, if a Gemini model is used as the provider, `GEMINI_API_KEY` is read from the environment.
+   - API token of the LLM provider, e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
+   - Environment variable reference, using the `"env:"` prefix, e.g. `api_token = "env:GROQ_API_KEY"`
+
+3. **`base_url`**:
+ - If your provider has a custom endpoint
+
+```python
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+## 4. Putting It All Together
+
+In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:
```python
import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def main():
@@ -318,17 +295,41 @@ async def main():
}
extraction = JsonCssExtractionStrategy(schema)
- # 3) Crawler run config: skip cache, use extraction
+ # 3) Example LLM content filtering
+
+ gemini_config = LLMConfig(
+        provider="gemini/gemini-1.5-pro",
+        api_token="env:GEMINI_API_KEY"
+ )
+
+ # Initialize LLM filter with specific instruction
+ filter = LLMContentFilter(
+ llm_config=gemini_config, # or your preferred provider
+ instruction="""
+ Focus on extracting the core educational content.
+ Include:
+ - Key concepts and explanations
+ - Important code examples
+ - Essential technical details
+ Exclude:
+ - Navigation elements
+ - Sidebars
+ - Footer content
+ Format the output as clean markdown with proper code blocks and headers.
+ """,
+ chunk_token_threshold=500, # Adjust based on your needs
+ verbose=True
+ )
+
+ md_generator = DefaultMarkdownGenerator(
+ content_filter=filter,
+        options={"ignore_links": True}
+    )
+
+ # 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
+ markdown_generator=md_generator,
extraction_strategy=extraction,
cache_mode=CacheMode.BYPASS,
- enable_rate_limiting=True,
- rate_limit_config=RateLimitConfig(
- base_delay=(1.0, 3.0),
- max_delay=60.0,
- max_retries=3,
- rate_limit_codes=[429, 503]
- )
)
async with AsyncWebCrawler(config=browser_conf) as crawler:
@@ -346,11 +347,11 @@ if __name__ == "__main__":
---
-## 4. Next Steps
+## 5. Next Steps
For a **detailed list** of available parameters (including advanced ones), see:
-- [BrowserConfig and CrawlerRunConfig Reference](../api/parameters.md)
+- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
You can explore topics like:
@@ -361,11 +362,12 @@ You can explore topics like:
---
-## 5. Conclusion
+## 6. Conclusion
-**BrowserConfig** and **CrawlerRunConfig** give you straightforward ways to define:
+**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
+- **Which** LLM provider to use, along with its API token, temperature, and base URL for custom endpoints.
Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling!
\ No newline at end of file
diff --git a/docs/md_v2/core/cli.md b/docs/md_v2/core/cli.md
new file mode 100644
index 00000000..ff4bf658
--- /dev/null
+++ b/docs/md_v2/core/cli.md
@@ -0,0 +1,304 @@
+# Crawl4AI CLI Guide
+
+## Table of Contents
+- [Installation](#installation)
+- [Basic Usage](#basic-usage)
+- [Configuration](#configuration)
+ - [Browser Configuration](#browser-configuration)
+ - [Crawler Configuration](#crawler-configuration)
+ - [Extraction Configuration](#extraction-configuration)
+ - [Content Filtering](#content-filtering)
+- [Advanced Features](#advanced-features)
+ - [LLM Q&A](#llm-qa)
+ - [Structured Data Extraction](#structured-data-extraction)
+ - [Content Filtering](#content-filtering-1)
+- [Output Formats](#output-formats)
+- [Examples](#examples)
+- [Configuration Reference](#configuration-reference)
+- [Best Practices & Tips](#best-practices--tips)
+
+## Basic Usage
+
+The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
+
+```bash
+# Basic crawling
+crwl https://example.com
+
+# Get markdown output
+crwl https://example.com -o markdown
+
+# Verbose JSON output with cache bypass
+crwl https://example.com -o json -v --bypass-cache
+
+# See usage examples
+crwl --example
+```
+
+## Quick Example of Advanced Usage
+
+If you clone the repository and run the following command, you will receive the content of the page in JSON format according to a JSON-CSS schema:
+
+```bash
+crwl "https://www.infoq.com/ai-ml-data-eng/" -e docs/examples/cli/extract_css.yml -s docs/examples/cli/css_schema.json -o json;
+```
+
+## Configuration
+
+### Browser Configuration
+
+Browser settings can be configured via YAML file or command line parameters:
+
+```yaml
+# browser.yml
+headless: true
+viewport_width: 1280
+user_agent_mode: "random"
+verbose: true
+ignore_https_errors: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -B browser.yml
+
+# Using direct parameters
+crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
+```
+
+### Crawler Configuration
+
+Control crawling behavior:
+
+```yaml
+# crawler.yml
+cache_mode: "bypass"
+wait_until: "networkidle"
+page_timeout: 30000
+delay_before_return_html: 0.5
+word_count_threshold: 100
+scan_full_page: true
+scroll_delay: 0.3
+process_iframes: false
+remove_overlay_elements: true
+magic: true
+verbose: true
+```
+
+```bash
+# Using config file
+crwl https://example.com -C crawler.yml
+
+# Using direct parameters
+crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
+```
+
+### Extraction Configuration
+
+Two types of extraction are supported:
+
+1. CSS/XPath-based extraction:
+```yaml
+# extract_css.yml
+type: "json-css"
+params:
+ verbose: true
+```
+
+```json
+// css_schema.json
+{
+ "name": "ArticleExtractor",
+ "baseSelector": ".article",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h1.title",
+ "type": "text"
+ },
+ {
+ "name": "link",
+ "selector": "a.read-more",
+ "type": "attribute",
+ "attribute": "href"
+ }
+ ]
+}
+```
+
+2. LLM-based extraction:
+```yaml
+# extract_llm.yml
+type: "llm"
+provider: "openai/gpt-4"
+instruction: "Extract all articles with their titles and links"
+api_token: "your-token"
+params:
+ temperature: 0.3
+ max_tokens: 1000
+```
+
+```json
+// llm_schema.json
+{
+ "title": "Article",
+ "type": "object",
+ "properties": {
+ "title": {
+ "type": "string",
+ "description": "The title of the article"
+ },
+ "link": {
+ "type": "string",
+ "description": "URL to the full article"
+ }
+ }
+}
+```
+
+## Advanced Features
+
+### LLM Q&A
+
+Ask questions about crawled content:
+
+```bash
+# Simple question
+crwl https://example.com -q "What is the main topic discussed?"
+
+# View content then ask questions
+crwl https://example.com -o markdown # See content first
+crwl https://example.com -q "Summarize the key points"
+crwl https://example.com -q "What are the conclusions?"
+
+# Combined with advanced crawling
+crwl https://example.com \
+ -B browser.yml \
+ -c "css_selector=article,scan_full_page=true" \
+ -q "What are the pros and cons mentioned?"
+```
+
+First-time setup:
+- Prompts for LLM provider and API token
+- Saves configuration in `~/.crawl4ai/global.yml`
+- Supports various providers (openai/gpt-4, anthropic/claude-3-sonnet, etc.)
+- For `ollama` you do not need to provide an API token.
+- See [LiteLLM Providers](https://docs.litellm.ai/docs/providers) for full list
+
+### Structured Data Extraction
+
+Extract structured data using CSS selectors:
+
+```bash
+crwl https://example.com \
+ -e extract_css.yml \
+ -s css_schema.json \
+ -o json
+```
+
+Or using LLM-based extraction:
+
+```bash
+crwl https://example.com \
+ -e extract_llm.yml \
+ -s llm_schema.json \
+ -o json
+```
+
+### Content Filtering
+
+Filter content for relevance:
+
+```yaml
+# filter_bm25.yml
+type: "bm25"
+query: "target content"
+threshold: 1.0
+
+# filter_pruning.yml
+type: "pruning"
+query: "focus topic"
+threshold: 0.48
+```
+
+```bash
+crwl https://example.com -f filter_bm25.yml -o markdown-fit
+```
+
+## Output Formats
+
+- `all` - Full crawl result including metadata
+- `json` - Extracted structured data (when using extraction)
+- `markdown` / `md` - Raw markdown output
+- `markdown-fit` / `md-fit` - Filtered markdown for better readability
+
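+For example, using the filter config from the Content Filtering section above to compare raw and filtered markdown:
+
+```bash
+# Full crawl result including metadata
+crwl https://example.com -o all
+
+# Raw markdown
+crwl https://example.com -o markdown
+
+# Filtered markdown (requires a content filter config)
+crwl https://example.com -f filter_bm25.yml -o markdown-fit
+```
+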
+## Complete Examples
+
+1. Basic Extraction:
+```bash
+crwl https://example.com \
+ -B browser.yml \
+ -C crawler.yml \
+ -o json
+```
+
+2. Structured Data Extraction:
+```bash
+crwl https://example.com \
+ -e extract_css.yml \
+ -s css_schema.json \
+ -o json \
+ -v
+```
+
+3. LLM Extraction with Filtering:
+```bash
+crwl https://example.com \
+ -B browser.yml \
+ -e extract_llm.yml \
+ -s llm_schema.json \
+ -f filter_bm25.yml \
+ -o json
+```
+
+4. Interactive Q&A:
+```bash
+# First crawl and view
+crwl https://example.com -o markdown
+
+# Then ask questions
+crwl https://example.com -q "What are the main points?"
+crwl https://example.com -q "Summarize the conclusions"
+```
+
+## Best Practices & Tips
+
+1. **Configuration Management**:
+ - Keep common configurations in YAML files
+ - Use CLI parameters for quick overrides
+ - Store sensitive data (API tokens) in `~/.crawl4ai/global.yml`
+
+2. **Performance Optimization**:
+ - Use `--bypass-cache` for fresh content
+ - Enable `scan_full_page` for infinite scroll pages
+ - Adjust `delay_before_return_html` for dynamic content
+
+3. **Content Extraction**:
+ - Use CSS extraction for structured content
+ - Use LLM extraction for unstructured content
+ - Combine with filters for focused results
+
+4. **Q&A Workflow**:
+ - View content first with `-o markdown`
+ - Ask specific questions
+ - Use broader context with appropriate selectors
+
+## Recap
+
+The Crawl4AI CLI provides:
+- Flexible configuration via files and parameters
+- Multiple extraction strategies (CSS, XPath, LLM)
+- Content filtering and optimization
+- Interactive Q&A capabilities
+- Various output formats
+
diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md
index 5d46ef10..07c8861b 100644
--- a/docs/md_v2/core/content-selection.md
+++ b/docs/md_v2/core/content-selection.md
@@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co
## 1. CSS-Based Selection
+There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
+
+### 1.1 Using `css_selector`
+
A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
```python
@@ -32,6 +36,33 @@ if __name__ == "__main__":
**Result**: Only elements matching that selector remain in `result.cleaned_html`.
+### 1.2 Using `target_elements`
+
+The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ config = CrawlerRunConfig(
+ # Target article body and sidebar, but not other content
+ target_elements=["article.main-content", "aside.sidebar"]
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com/blog-post",
+ config=config
+ )
+ print("Markdown focused on target elements")
+ print("Links from entire page still available:", len(result.links.get("internal", [])))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
+
---
## 2. Content Filtering & Exclusions
@@ -168,10 +199,10 @@ async def main():
"name": "News Items",
"baseSelector": "tr.athing",
"fields": [
- {"name": "title", "selector": "a.storylink", "type": "text"},
+ {"name": "title", "selector": "span.titleline a", "type": "text"},
{
"name": "link",
- "selector": "a.storylink",
+ "selector": "span.titleline a",
"type": "attribute",
"attribute": "href"
}
@@ -211,7 +242,7 @@ if __name__ == "__main__":
import asyncio
import json
from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class ArticleData(BaseModel):
@@ -220,8 +251,7 @@ class ArticleData(BaseModel):
async def main():
llm_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4",
- api_token="sk-YOUR_API_KEY",
+ llm_config=LLMConfig(provider="openai/gpt-4", api_token="sk-YOUR_API_KEY"),
schema=ArticleData.schema(),
extraction_type="schema",
instruction="Extract 'headline' and a short 'summary' from the content."
@@ -405,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when:
---
-## 7. Conclusion
+## 7. Combining CSS Selection Methods
-By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
+You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:
-1. **`css_selector`** – Basic scoping to an element or region.
-2. **`word_count_threshold`** – Skip short blocks.
-3. **`excluded_tags`** – Remove entire HTML tags.
-4. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.
-5. **`exclude_external_images`** – Remove images from external sources.
-6. **`process_iframes`** – Merge iframe content if needed.
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def main():
+ # Target specific content but preserve page context
+ config = CrawlerRunConfig(
+ # Focus markdown on main content and sidebar
+ target_elements=["#main-content", ".sidebar"],
+
+ # Global filters applied to entire page
+ excluded_tags=["nav", "footer", "header"],
+ exclude_external_links=True,
+
+ # Use basic content thresholds
+ word_count_threshold=15,
+
+ cache_mode=CacheMode.BYPASS
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com/article",
+ config=config
+ )
+
+ print(f"Content focuses on specific elements, but all links still analyzed")
+ print(f"Internal links: {len(result.links.get('internal', []))}")
+ print(f"External links: {len(result.links.get('external', []))}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+This approach gives you the best of both worlds:
+- Markdown generation and content extraction focus on the elements you care about
+- Links, images and other page data still give you the full context of the page
+- Content filtering still applies globally
+
+## 8. Conclusion
+
+By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
+
+1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media.
+2. **`css_selector`** – Basic scoping to an element or region for all extraction processes.
+3. **`word_count_threshold`** – Skip short blocks.
+4. **`excluded_tags`** – Remove entire HTML tags.
+5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.
+6. **`exclude_external_images`** – Remove images from external sources.
+7. **`process_iframes`** – Merge iframe content if needed.
Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!
\ No newline at end of file
diff --git a/docs/md_v2/core/crawler-result.md b/docs/md_v2/core/crawler-result.md
index e0e627ba..d7648ecb 100644
--- a/docs/md_v2/core/crawler-result.md
+++ b/docs/md_v2/core/crawler-result.md
@@ -26,8 +26,8 @@ class CrawlResult(BaseModel):
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
+ mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
- markdown_v2: Optional[MarkdownGenerationResult] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
error_message: Optional[str] = None
@@ -52,8 +52,8 @@ class CrawlResult(BaseModel):
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
-| **markdown (`Optional[str or MarkdownGenerationResult]`)** | For now, `markdown_v2` holds a `MarkdownGenerationResult`. Over time, this will be consolidated into `markdown`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
-| **markdown_v2 (`Optional[MarkdownGenerationResult]`)** | Legacy field for detailed markdown output. This will be replaced by `markdown` soon. |
+| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
+| **markdown (`Optional[str or MarkdownGenerationResult]`)** | Holds a `MarkdownGenerationResult`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
| **extracted_content (`Optional[str]`)** | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text. |
| **metadata (`Optional[dict]`)** | Additional info about the crawl or extracted data. |
| **error_message (`Optional[str]`)** | If `success=False`, contains a short description of what went wrong. |
@@ -90,10 +90,10 @@ print(result.cleaned_html) # Freed of forms, header, footer, data-* attributes
## 3. Markdown Generation
-### 3.1 `markdown_v2` (Legacy) vs `markdown`
+### 3.1 `markdown`
-- **`markdown_v2`**: The current location for detailed markdown output, returning a **`MarkdownGenerationResult`** object.
-- **`markdown`**: Eventually, we’re merging these fields. For now, you might see `result.markdown_v2` used widely in code examples.
+- **`markdown`**: The current location for detailed markdown output, returning a **`MarkdownGenerationResult`** object.
+- **`markdown_v2`**: Deprecated since v0.5.
**`MarkdownGenerationResult`** Fields:
@@ -118,7 +118,7 @@ config = CrawlerRunConfig(
)
result = await crawler.arun(url="https://example.com", config=config)
-md_res = result.markdown_v2 # or eventually 'result.markdown'
+md_res = result.markdown  # MarkdownGenerationResult
print(md_res.raw_markdown[:500])
print(md_res.markdown_with_citations)
print(md_res.references_markdown)
@@ -192,18 +192,27 @@ for img in images:
print("Image URL:", img["src"], "Alt:", img.get("alt"))
```
-### 5.3 `screenshot` and `pdf`
+### 5.3 `screenshot`, `pdf`, and `mhtml`
-If you set `screenshot=True` or `pdf=True` in **`CrawlerRunConfig`**, then:
+If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then:
-- `result.screenshot` contains a base64-encoded PNG string.
+- `result.screenshot` contains a base64-encoded PNG string.
- `result.pdf` contains raw PDF bytes (you can write them to a file).
+- `result.mhtml` contains the MHTML snapshot of the page as a string (you can write it to a .mhtml file).
```python
+# Save the PDF
with open("page.pdf", "wb") as f:
f.write(result.pdf)
+
+# Save the MHTML
+if result.mhtml:
+ with open("page.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
```
+The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing.
+
### 5.4 `ssl_certificate`
If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site’s SSL cert, such as issuer, validity dates, etc.
@@ -224,15 +233,17 @@ Check any field:
if result.success:
print(result.status_code, result.response_headers)
print("Links found:", len(result.links.get("internal", [])))
- if result.markdown_v2:
- print("Markdown snippet:", result.markdown_v2.raw_markdown[:200])
+ if result.markdown:
+ print("Markdown snippet:", result.markdown.raw_markdown[:200])
if result.extracted_content:
print("Structured JSON:", result.extracted_content)
else:
print("Error:", result.error_message)
```
-**Remember**: Use `result.markdown_v2` for now. It will eventually become `result.markdown`.
+**Deprecation**: Since v0.5, `result.markdown_v2`, `result.fit_html`, and `result.fit_markdown` are deprecated. Use `result.markdown` instead! It holds a `MarkdownGenerationResult`, which exposes `fit_html` and `fit_markdown` as properties.
+
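+For example, a minimal way to read both raw and filtered markdown through the consolidated field (assuming a markdown generator with a content filter was configured):
+
+```python
+md = result.markdown              # MarkdownGenerationResult
+print(md.raw_markdown[:200])
+if md.fit_markdown:               # populated when a content filter is configured
+    print(md.fit_markdown[:200])
+```
+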
---
diff --git a/docs/md_v2/core/deep-crawling.md b/docs/md_v2/core/deep-crawling.md
new file mode 100644
index 00000000..00834787
--- /dev/null
+++ b/docs/md_v2/core/deep-crawling.md
@@ -0,0 +1,488 @@
+# Deep Crawling
+
+One of Crawl4AI's most powerful features is its ability to perform **configurable deep crawling** that can explore websites beyond a single page. With fine-tuned control over crawl depth, domain boundaries, and content filtering, Crawl4AI gives you the tools to extract precisely the content you need.
+
+In this tutorial, you'll learn:
+
+1. How to set up a **Basic Deep Crawler** with BFS strategy
+2. Understanding the difference between **streamed and non-streamed** output
+3. Implementing **filters and scorers** to target specific content
+4. Creating **advanced filtering chains** for sophisticated crawls
+5. Using **BestFirstCrawling** for intelligent exploration prioritization
+
+> **Prerequisites**
+> - You’ve completed or read [AsyncWebCrawler Basics](../core/simple-crawling.md) to understand how to run a simple crawl.
+> - You know how to configure `CrawlerRunConfig`.
+
+---
+
+## 1. Quick Example
+
+Here's a minimal code snippet that implements a basic deep crawl using the **BFSDeepCrawlStrategy**:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+
+async def main():
+ # Configure a 2-level deep crawl
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2,
+ include_external=False
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ verbose=True
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ results = await crawler.arun("https://example.com", config=config)
+
+ print(f"Crawled {len(results)} pages in total")
+
+ # Access individual results
+ for result in results[:3]: # Show first 3 results
+ print(f"URL: {result.url}")
+ print(f"Depth: {result.metadata.get('depth', 0)}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**What's happening?**
+- `BFSDeepCrawlStrategy(max_depth=2, include_external=False)` instructs Crawl4AI to:
+ - Crawl the starting page (depth 0) plus 2 more levels
+ - Stay within the same domain (don't follow external links)
+- Each result contains metadata like the crawl depth
+- Results are returned as a list after all crawling is complete
+
+---
+
+## 2. Understanding Deep Crawling Strategy Options
+
+### 2.1 BFSDeepCrawlStrategy (Breadth-First Search)
+
+The **BFSDeepCrawlStrategy** uses a breadth-first approach, exploring all links at one depth before moving deeper:
+
+```python
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+
+# Basic configuration
+strategy = BFSDeepCrawlStrategy(
+ max_depth=2, # Crawl initial page + 2 levels deep
+ include_external=False, # Stay within the same domain
+ max_pages=50, # Maximum number of pages to crawl (optional)
+ score_threshold=0.3, # Minimum score for URLs to be crawled (optional)
+)
+```
+
+**Key parameters:**
+- **`max_depth`**: Number of levels to crawl beyond the starting page
+- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
+- **`filter_chain`**: FilterChain instance for URL filtering
+- **`url_scorer`**: Scorer instance for evaluating URLs
+
+### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
+
+The **DFSDeepCrawlStrategy** uses a depth-first approach, exploring as far down each branch as possible before backtracking.
+
+```python
+from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
+
+# Basic configuration
+strategy = DFSDeepCrawlStrategy(
+ max_depth=2, # Crawl initial page + 2 levels deep
+ include_external=False, # Stay within the same domain
+ max_pages=30, # Maximum number of pages to crawl (optional)
+ score_threshold=0.5, # Minimum score for URLs to be crawled (optional)
+)
+```
+
+**Key parameters:**
+- **`max_depth`**: Number of levels to crawl beyond the starting page
+- **`include_external`**: Whether to follow links to other domains
+- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
+- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
+- **`filter_chain`**: FilterChain instance for URL filtering
+- **`url_scorer`**: Scorer instance for evaluating URLs
+
+### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)
+
+For more intelligent crawling, use **BestFirstCrawlingStrategy** with scorers to prioritize the most relevant pages:
+
+```python
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+# Create a scorer
+scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"],
+ weight=0.7
+)
+
+# Configure the strategy
+strategy = BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ url_scorer=scorer,
+ max_pages=25, # Maximum number of pages to crawl (optional)
+)
+```
+
+This crawling approach:
+- Evaluates each discovered URL based on scorer criteria
+- Visits higher-scoring pages first
+- Helps focus crawl resources on the most relevant content
+- Can limit total pages crawled with `max_pages`
+- Does not need `score_threshold` as it naturally prioritizes by score
+
+---
+
+## 3. Streaming vs. Non-Streaming Results
+
+Crawl4AI can return results in two modes:
+
+### 3.1 Non-Streaming Mode (Default)
+
+```python
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
+ stream=False # Default behavior
+)
+
+async with AsyncWebCrawler() as crawler:
+ # Wait for ALL results to be collected before returning
+ results = await crawler.arun("https://example.com", config=config)
+
+ for result in results:
+ process_result(result)
+```
+
+**When to use non-streaming mode:**
+- You need the complete dataset before processing
+- You're performing batch operations on all results together
+- Crawl time isn't a critical factor
+
+### 3.2 Streaming Mode
+
+```python
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
+ stream=True # Enable streaming
+)
+
+async with AsyncWebCrawler() as crawler:
+ # Returns an async iterator
+ async for result in await crawler.arun("https://example.com", config=config):
+ # Process each result as it becomes available
+ process_result(result)
+```
+
+**Benefits of streaming mode:**
+- Process results immediately as they're discovered
+- Start working with early results while crawling continues
+- Better for real-time applications or progressive display
+- Reduces memory pressure when handling many pages
+
+---
+
+## 4. Filtering Content with Filter Chains
+
+Filters help you narrow down which pages to crawl. Combine multiple filters using **FilterChain** for powerful targeting.
+
+### 4.1 Basic URL Pattern Filter
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
+
+# Only follow URLs containing "blog" or "docs"
+url_filter = URLPatternFilter(patterns=["*blog*", "*docs*"])
+
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1,
+ filter_chain=FilterChain([url_filter])
+ )
+)
+```
+
+### 4.2 Combining Multiple Filters
+
+```python
+from crawl4ai.deep_crawling.filters import (
+ FilterChain,
+ URLPatternFilter,
+ DomainFilter,
+ ContentTypeFilter
+)
+
+# Create a chain of filters
+filter_chain = FilterChain([
+ # Only follow URLs with specific patterns
+ URLPatternFilter(patterns=["*guide*", "*tutorial*"]),
+
+ # Only crawl specific domains
+ DomainFilter(
+ allowed_domains=["docs.example.com"],
+ blocked_domains=["old.docs.example.com"]
+ ),
+
+ # Only include specific content types
+ ContentTypeFilter(allowed_types=["text/html"])
+])
+
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2,
+ filter_chain=filter_chain
+ )
+)
+```
+
+### 4.3 Available Filter Types
+
+Crawl4AI includes several specialized filters:
+
+- **`URLPatternFilter`**: Matches URL patterns using wildcard syntax
+- **`DomainFilter`**: Controls which domains to include or exclude
+- **`ContentTypeFilter`**: Filters based on HTTP Content-Type
+- **`ContentRelevanceFilter`**: Uses similarity to a text query
+- **`SEOFilter`**: Evaluates SEO elements (meta tags, headers, etc.)
+
+---
+
+## 5. Using Scorers for Prioritized Crawling
+
+Scorers assign priority values to discovered URLs, helping the crawler focus on the most relevant content first.
+
+### 5.1 KeywordRelevanceScorer
+
+```python
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+
+# Create a keyword relevance scorer
+keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"],
+ weight=0.7 # Importance of this scorer (0.0 to 1.0)
+)
+
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=2,
+ url_scorer=keyword_scorer
+ ),
+ stream=True # Recommended with BestFirstCrawling
+)
+
+# Results will come in order of relevance score
+async with AsyncWebCrawler() as crawler:
+ async for result in await crawler.arun("https://example.com", config=config):
+ score = result.metadata.get("score", 0)
+ print(f"Score: {score:.2f} | {result.url}")
+```
+
+**How scorers work:**
+- Evaluate each discovered URL before crawling
+- Calculate relevance based on various signals
+- Help the crawler make intelligent choices about traversal order
+
+---
+
+## 6. Advanced Filtering Techniques
+
+### 6.1 SEO Filter for Quality Assessment
+
+The **SEOFilter** helps you identify pages with strong SEO characteristics:
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, SEOFilter
+
+# Create an SEO filter that looks for specific keywords in page metadata
+seo_filter = SEOFilter(
+ threshold=0.5, # Minimum score (0.0 to 1.0)
+ keywords=["tutorial", "guide", "documentation"]
+)
+
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1,
+ filter_chain=FilterChain([seo_filter])
+ )
+)
+```
+
+### 6.2 Content Relevance Filter
+
+The **ContentRelevanceFilter** analyzes the actual content of pages:
+
+```python
+from crawl4ai.deep_crawling.filters import FilterChain, ContentRelevanceFilter
+
+# Create a content relevance filter
+relevance_filter = ContentRelevanceFilter(
+ query="Web crawling and data extraction with Python",
+ threshold=0.7 # Minimum similarity score (0.0 to 1.0)
+)
+
+config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=1,
+ filter_chain=FilterChain([relevance_filter])
+ )
+)
+```
+
+This filter:
+- Measures relevance between the query and the page content
+- Uses BM25-based scoring over the page's head section content
+
+---
+
+## 7. Building a Complete Advanced Crawler
+
+This example combines multiple techniques for a sophisticated crawl:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import (
+ FilterChain,
+ DomainFilter,
+ URLPatternFilter,
+ ContentTypeFilter
+)
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+
+async def run_advanced_crawler():
+ # Create a sophisticated filter chain
+ filter_chain = FilterChain([
+ # Domain boundaries
+ DomainFilter(
+ allowed_domains=["docs.example.com"],
+ blocked_domains=["old.docs.example.com"]
+ ),
+
+ # URL patterns to include
+ URLPatternFilter(patterns=["*guide*", "*tutorial*", "*blog*"]),
+
+ # Content type filtering
+ ContentTypeFilter(allowed_types=["text/html"])
+ ])
+
+ # Create a relevance scorer
+ keyword_scorer = KeywordRelevanceScorer(
+ keywords=["crawl", "example", "async", "configuration"],
+ weight=0.7
+ )
+
+ # Set up the configuration
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ filter_chain=filter_chain,
+ url_scorer=keyword_scorer
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ stream=True,
+ verbose=True
+ )
+
+ # Execute the crawl
+ results = []
+ async with AsyncWebCrawler() as crawler:
+ async for result in await crawler.arun("https://docs.example.com", config=config):
+ results.append(result)
+ score = result.metadata.get("score", 0)
+ depth = result.metadata.get("depth", 0)
+ print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
+
+ # Analyze the results
+ print(f"Crawled {len(results)} high-value pages")
+ print(f"Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}")
+
+ # Group by depth
+ depth_counts = {}
+ for result in results:
+ depth = result.metadata.get("depth", 0)
+ depth_counts[depth] = depth_counts.get(depth, 0) + 1
+
+ print("Pages crawled by depth:")
+ for depth, count in sorted(depth_counts.items()):
+ print(f" Depth {depth}: {count} pages")
+
+if __name__ == "__main__":
+ asyncio.run(run_advanced_crawler())
+```
+
+---
+
+
+## 8. Limiting and Controlling Crawl Size
+
+### 8.1 Using max_pages
+
+You can limit the total number of pages crawled with the `max_pages` parameter:
+
+```python
+# Limit to exactly 20 pages regardless of depth
+strategy = BFSDeepCrawlStrategy(
+ max_depth=3,
+ max_pages=20
+)
+```
+
+This feature is useful for:
+- Controlling API costs
+- Setting predictable execution times
+- Focusing on the most important content
+- Testing crawl configurations before full execution
+
+### 8.2 Using score_threshold
+
+For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:
+
+```python
+# Only follow links with scores above 0.4
+strategy = DFSDeepCrawlStrategy(
+ max_depth=2,
+ url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
+ score_threshold=0.4 # Skip URLs with scores below this value
+)
+```
+
+Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pages are already processed in order of highest score first.
+
+## 9. Common Pitfalls & Tips
+
+1. **Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.
+
+2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
+
+3. **Be a good web citizen.** Respect robots.txt (robots.txt checking is disabled by default).
+
+4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` when processing results.
+
+5. **Balance breadth vs. depth.** Choose your strategy wisely: BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused, relevance-based crawling.
+
+---
+
+## 10. Summary & Next Steps
+
+In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
+
+- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
+- Process results in streaming or non-streaming mode
+- Apply filters to target specific content
+- Use scorers to prioritize the most relevant pages
+- Limit crawls with `max_pages` and `score_threshold` parameters
+- Build a complete advanced crawler with combined techniques
+
+With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
new file mode 100644
index 00000000..7e239d43
--- /dev/null
+++ b/docs/md_v2/core/docker-deployment.md
@@ -0,0 +1,821 @@
+# Crawl4AI Docker Guide 🐳
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+ - [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
+ - [Option 2: Using Docker Compose](#option-2-using-docker-compose)
+ - [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+ - [Playground Interface](#playground-interface)
+ - [Python SDK](#python-sdk)
+ - [Understanding Request Schema](#understanding-request-schema)
+ - [REST API Examples](#rest-api-examples)
+- [Additional API Endpoints](#additional-api-endpoints)
+ - [HTML Extraction Endpoint](#html-extraction-endpoint)
+ - [Screenshot Endpoint](#screenshot-endpoint)
+ - [PDF Export Endpoint](#pdf-export-endpoint)
+ - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
+ - [Library Context Endpoint](#library-context-endpoint)
+- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
+ - [What is MCP?](#what-is-mcp)
+ - [Connecting via MCP](#connecting-via-mcp)
+ - [Using with Claude Code](#using-with-claude-code)
+ - [Available MCP Tools](#available-mcp-tools)
+ - [Testing MCP Connections](#testing-mcp-connections)
+ - [MCP Schemas](#mcp-schemas)
+- [Metrics & Monitoring](#metrics--monitoring)
+- [Deployment Scenarios](#deployment-scenarios)
+- [Complete Examples](#complete-examples)
+- [Server Configuration](#server-configuration)
+ - [Understanding config.yml](#understanding-configyml)
+ - [JWT Authentication](#jwt-authentication)
+ - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
+ - [Customizing Your Configuration](#customizing-your-configuration)
+ - [Configuration Recommendations](#configuration-recommendations)
+- [Getting Help](#getting-help)
+- [Summary](#summary)
+
+## Prerequisites
+
+Before we dive in, make sure you have:
+- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
+- `git` for cloning the repository.
+- At least 4GB of RAM available for the container (more recommended for heavy use).
+- Python 3.10+ (if using the Python SDK).
+- Node.js 16+ (if using the Node.js examples).
+
+> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
+
+## Installation
+
+We offer several ways to get the Crawl4AI server running. The quickest way is to use our pre-built Docker Hub images.
+
+### Option 1: Using Pre-built Docker Hub Images (Recommended)
+
+Pull and run images directly from Docker Hub without building locally.
+
+#### 1. Pull the Image
+
+Our latest release candidate is `0.6.0-r2`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
+
+```bash
+# Pull the release candidate (recommended for latest features)
+docker pull unclecode/crawl4ai:0.6.0-r2
+
+# Or pull the latest stable version
+docker pull unclecode/crawl4ai:latest
+```
+
+#### 2. Setup Environment (API Keys)
+
+If you plan to use LLMs, create a `.llm.env` file in your working directory:
+
+```bash
+# Create a .llm.env file with your API keys
+cat > .llm.env << EOL
+# OpenAI
+OPENAI_API_KEY=sk-your-key
+
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-key
+
+# Other providers as needed
+# DEEPSEEK_API_KEY=your-deepseek-key
+# GROQ_API_KEY=your-groq-key
+# TOGETHER_API_KEY=your-together-key
+# MISTRAL_API_KEY=your-mistral-key
+# GEMINI_API_TOKEN=your-gemini-token
+EOL
+```
+> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
+
+#### 3. Run the Container
+
+* **Basic run:**
+ ```bash
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai \
+ --shm-size=1g \
+ unclecode/crawl4ai:latest
+ ```
+
+* **With LLM support:**
+ ```bash
+ # Make sure .llm.env is in the current directory
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai \
+ --env-file .llm.env \
+ --shm-size=1g \
+ unclecode/crawl4ai:latest
+ ```
+
+> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
+
+#### 4. Stopping the Container
+
+```bash
+docker stop crawl4ai && docker rm crawl4ai
+```
+
+#### Docker Hub Versioning Explained
+
+* **Image Name:** `unclecode/crawl4ai`
+* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r2`)
+ * `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
+ * `SUFFIX`: Optional suffix for release candidates and revisions (e.g., `r1`, `r2`)
+* **`latest` Tag:** Points to the most recent stable version
+* **Multi-Architecture Support:** All images support both `linux/amd64` and `linux/arm64` architectures through a single tag
+
+### Option 2: Using Docker Compose
+
+Docker Compose simplifies building and running the service, especially for local development and testing.
+
+#### 1. Clone Repository
+
+```bash
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+```
+
+#### 2. Environment Setup (API Keys)
+
+If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
+
+```bash
+# Make sure you are in the 'crawl4ai' root directory
+cp deploy/docker/.llm.env.example .llm.env
+
+# Now edit .llm.env and add your API keys
+```
+
+#### 3. Build and Run with Compose
+
+The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
+
+* **Run Pre-built Image from Docker Hub:**
+ ```bash
+ # Pulls and runs the specified image from Docker Hub (here the latest tag)
+ # Automatically selects the correct architecture
+ IMAGE=unclecode/crawl4ai:latest docker compose up -d
+ ```
+
+* **Build and Run Locally:**
+ ```bash
+ # Builds the image locally using Dockerfile and runs it
+ # Automatically uses the correct architecture for your machine
+ docker compose up --build -d
+ ```
+
+* **Customize the Build:**
+ ```bash
+ # Build with all features (includes torch and transformers)
+ INSTALL_TYPE=all docker compose up --build -d
+
+ # Build with GPU support (for AMD64 platforms)
+ ENABLE_GPU=true docker compose up --build -d
+ ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping the Service
+
+```bash
+# Stop the service
+docker compose down
+```
+
+### Option 3: Manual Local Build & Run
+
+If you prefer not to use Docker Compose for direct control over the build and run process.
+
+#### 1. Clone Repository & Setup Environment
+
+Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
+
+#### 2. Build the Image (Multi-Arch)
+
+Use `docker buildx` to build the image. Crawl4AI now uses buildx to handle multi-architecture builds automatically.
+
+```bash
+# Make sure you are in the 'crawl4ai' root directory
+# Build for the current architecture and load it into Docker
+docker buildx build -t crawl4ai-local:latest --load .
+
+# Or build for multiple architectures (useful for publishing)
+docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
+
+# Build with additional options
+docker buildx build \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=false \
+ -t crawl4ai-local:latest --load .
+```
+
+#### 3. Run the Container
+
+* **Basic run (no LLM support):**
+ ```bash
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai-standalone \
+ --shm-size=1g \
+ crawl4ai-local:latest
+ ```
+
+* **With LLM support:**
+ ```bash
+ # Make sure .llm.env is in the current directory (project root)
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai-standalone \
+ --env-file .llm.env \
+ --shm-size=1g \
+ crawl4ai-local:latest
+ ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping the Manual Container
+
+```bash
+docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
+```
+
+---
+
+## MCP (Model Context Protocol) Support
+
+Crawl4AI server includes support for the Model Context Protocol (MCP), allowing you to connect the server's capabilities directly to MCP-compatible clients like Claude Code.
+
+### What is MCP?
+
+MCP is an open protocol that standardizes how applications provide context to LLMs. It allows AI models to access external tools, data sources, and services through a standardized interface.
+
+### Connecting via MCP
+
+The Crawl4AI server exposes two MCP endpoints:
+
+- **Server-Sent Events (SSE)**: `http://localhost:11235/mcp/sse`
+- **WebSocket**: `ws://localhost:11235/mcp/ws`
+
+### Using with Claude Code
+
+You can add Crawl4AI as an MCP tool provider in Claude Code with a simple command:
+
+```bash
+# Add the Crawl4AI server as an MCP provider
+claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
+
+# List all MCP providers to verify it was added
+claude mcp list
+```
+
+Once connected, Claude Code can directly use Crawl4AI's capabilities like screenshot capture, PDF generation, and HTML processing without having to make separate API calls.
+
+### Available MCP Tools
+
+When connected via MCP, the following tools are available:
+
+- `md` - Generate markdown from web content
+- `html` - Extract preprocessed HTML
+- `screenshot` - Capture webpage screenshots
+- `pdf` - Generate PDF documents
+- `execute_js` - Run JavaScript on web pages
+- `crawl` - Perform multi-URL crawling
+- `ask` - Query the Crawl4AI library context
+
+### Testing MCP Connections
+
+You can test the MCP WebSocket connection using the test file included in the repository:
+
+```bash
+# From the repository root
+python tests/mcp/test_mcp_socket.py
+```
+
+### MCP Schemas
+
+Access the MCP tool schemas at `http://localhost:11235/mcp/schema` for detailed information on each tool's parameters and capabilities.
+
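+For example:
+
+```bash
+curl http://localhost:11235/mcp/schema
+```
+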
+---
+
+## Additional API Endpoints
+
+In addition to the core `/crawl` and `/crawl/stream` endpoints, the server provides several specialized endpoints:
+
+### HTML Extraction Endpoint
+
+```
+POST /html
+```
+
+Crawls the URL and returns preprocessed HTML optimized for schema extraction.
+
+```json
+{
+ "url": "https://example.com"
+}
+```
+
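+A minimal `curl` sketch of the request (the response is JSON; its exact fields may vary by version):
+
+```bash
+curl -X POST http://localhost:11235/html \
+  -H "Content-Type: application/json" \
+  -d '{"url": "https://example.com"}'
+```
+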
+### Screenshot Endpoint
+
+```
+POST /screenshot
+```
+
+Captures a full-page PNG screenshot of the specified URL.
+
+```json
+{
+ "url": "https://example.com",
+ "screenshot_wait_for": 2,
+ "output_path": "/path/to/save/screenshot.png"
+}
+```
+
+- `screenshot_wait_for`: Optional delay in seconds before capture (default: 2)
+- `output_path`: Optional path to save the screenshot (recommended)
+
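+A `curl` sketch using the same payload shape (here asking the server to save the PNG to `output_path`):
+
+```bash
+curl -X POST http://localhost:11235/screenshot \
+  -H "Content-Type: application/json" \
+  -d '{"url": "https://example.com", "screenshot_wait_for": 2, "output_path": "/tmp/example.png"}'
+```
+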
+### PDF Export Endpoint
+
+```
+POST /pdf
+```
+
+Generates a PDF document of the specified URL.
+
+```json
+{
+ "url": "https://example.com",
+ "output_path": "/path/to/save/document.pdf"
+}
+```
+
+- `output_path`: Optional path to save the PDF (recommended)
+
+### JavaScript Execution Endpoint
+
+```
+POST /execute_js
+```
+
+Executes JavaScript snippets on the specified URL and returns the full crawl result.
+
+```json
+{
+ "url": "https://example.com",
+ "scripts": [
+ "return document.title",
+ "return Array.from(document.querySelectorAll('a')).map(a => a.href)"
+ ]
+}
+```
+
+- `scripts`: List of JavaScript snippets to execute sequentially
+
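+A minimal Python sketch of calling this endpoint with the payload above (the response is the full crawl result as JSON; its exact shape may vary by version):
+
+```python
+import requests
+
+payload = {
+    "url": "https://example.com",
+    "scripts": [
+        "return document.title",
+        "return Array.from(document.querySelectorAll('a')).map(a => a.href)"
+    ]
+}
+
+response = requests.post("http://localhost:11235/execute_js", json=payload)
+response.raise_for_status()
+print(response.json())  # full crawl result as JSON
+```
+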
+---
+
+## Dockerfile Parameters
+
+You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
+
+```bash
+# Example: Build with 'all' features using buildx
+docker buildx build \
+ --platform linux/amd64,linux/arm64 \
+ --build-arg INSTALL_TYPE=all \
+ -t yourname/crawl4ai-all:latest \
+ --load \
+ . # Build from root context
+```
+
+### Build Arguments Explained
+
+| Argument | Description | Default | Options |
+| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
+| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
+| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
+| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
+| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
+| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
+| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
+
+*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
+
+### Build Best Practices
+
+1. **Choose the Right Install Type**
+ * `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
+ * `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
+2. **Platform Considerations**
+ * Use `buildx` for building multi-architecture images, especially for pushing to registries.
+ * Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
+3. **Performance Optimization**
+ * The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
+
+---
+
+## Using the API
+
+Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
+
+### Playground Interface
+
+A built-in web playground is available at `http://localhost:11235/playground` for testing and generating API requests. The playground allows you to:
+
+1. Configure `CrawlerRunConfig` and `BrowserConfig` using the main library's Python syntax
+2. Test crawling operations directly from the interface
+3. Generate corresponding JSON for REST API requests based on your configuration
+
+This is the easiest way to translate Python configuration to JSON requests when building integrations.
+
+### Python SDK
+
+Install the SDK: `pip install crawl4ai`
+
+```python
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
+
+async def main():
+ # Point to the correct server port
+ async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
+ # If JWT is enabled on the server, authenticate first:
+ # await client.authenticate("user@example.com") # See Server Configuration section
+
+ # Example Non-streaming crawl
+ print("--- Running Non-Streaming Crawl ---")
+ results = await client.crawl(
+ ["https://httpbin.org/html"],
+ browser_config=BrowserConfig(headless=True), # Use library classes for config aid
+ crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+ )
+ if results: # client.crawl returns None on failure
+ print(f"Non-streaming results success: {results.success}")
+ if results.success:
+ for result in results: # Iterate through the CrawlResultContainer
+ print(f"URL: {result.url}, Success: {result.success}")
+ else:
+ print("Non-streaming crawl failed.")
+
+
+ # Example Streaming crawl
+ print("\n--- Running Streaming Crawl ---")
+ stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
+ try:
+ async for result in await client.crawl( # client.crawl returns an async generator for streaming
+ ["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=stream_config
+ ):
+ print(f"Streamed result: URL: {result.url}, Success: {result.success}")
+ except Exception as e:
+ print(f"Streaming crawl failed: {e}")
+
+
+ # Example Get schema
+ print("\n--- Getting Schema ---")
+ schema = await client.get_schema()
+ print(f"Schema received: {bool(schema)}") # Print whether schema was received
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+*(SDK parameters like timeout, verify_ssl etc. remain the same)*
+
+### Second Approach: Direct API Calls
+
+Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
+
+*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
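+
+As a minimal illustration of the basic pattern (the Simple Crawl example further below uses the same shape):
+
+```json
+{
+  "urls": ["https://example.com"],
+  "browser_config": {"type": "BrowserConfig", "params": {"headless": true}},
+  "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
+}
+```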
+
+#### More Examples *(Ensure Schema example uses type/value wrapper)*
+
+**Advanced Crawler Configuration**
+*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
+
+**Extraction Strategy**
+```json
+{
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "extraction_strategy": {
+ "type": "JsonCssExtractionStrategy",
+ "params": {
+ "schema": {
+ "type": "dict",
+ "value": {
+ "baseSelector": "article.post",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".content", "type": "html"}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
+*(Keep Deep Crawler Example)*
+
+### REST API Examples
+
+Update URLs to use port `11235`.
+
+#### Simple Crawl
+
+```python
+import requests
+
+# Configuration objects converted to the required JSON structure
+browser_config_payload = {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+}
+crawler_config_payload = {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
+}
+
+crawl_payload = {
+ "urls": ["https://httpbin.org/html"],
+ "browser_config": browser_config_payload,
+ "crawler_config": crawler_config_payload
+}
+response = requests.post(
+ "http://localhost:11235/crawl", # Updated port
+ # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
+ json=crawl_payload
+)
+print(f"Status Code: {response.status_code}")
+if response.ok:
+ print(response.json())
+else:
+ print(f"Error: {response.text}")
+
+```
+
+#### Streaming Results
+
+```python
+import json
+import httpx # Use httpx for async streaming example
+
+async def test_stream_crawl(token: str = None): # Made token optional
+ """Test the /crawl/stream endpoint with multiple URLs."""
+ url = "http://localhost:11235/crawl/stream" # Updated port
+ payload = {
+ "urls": [
+ "https://httpbin.org/html",
+ "https://httpbin.org/links/5/0",
+ ],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": True, "cache_mode": "bypass"}
+ }
+ }
+
+ headers = {}
+ # if token:
+ # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
+
+ try:
+ async with httpx.AsyncClient() as client:
+ async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
+ print(f"Status: {response.status_code} (Expected: 200)")
+ response.raise_for_status() # Raise exception for bad status codes
+
+ # Read streaming response line-by-line (NDJSON)
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ # Check for completion marker
+ if data.get("status") == "completed":
+ print("Stream completed.")
+ break
+ print(f"Streamed Result: {json.dumps(data, indent=2)}")
+ except json.JSONDecodeError:
+ print(f"Warning: Could not decode JSON line: {line}")
+
+ except httpx.HTTPStatusError as e:
+ print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
+ except Exception as e:
+ print(f"Error in streaming crawl test: {str(e)}")
+
+# To run this example:
+# import asyncio
+# asyncio.run(test_stream_crawl())
+```
+
+---
+
+## Metrics & Monitoring
+
+Keep an eye on your crawler with these endpoints:
+
+- `/health` - Quick health check
+- `/metrics` - Detailed Prometheus metrics
+- `/schema` - Full API schema
+
+Example health check:
+```bash
+curl http://localhost:11235/health
+```
+
+---
+
+*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
+
+---
+
+## Server Configuration
+
+The server's behavior can be customized through the `config.yml` file.
+
+### Understanding config.yml
+
+The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
+
+Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
+
+```yaml
+# Application Configuration
+app:
+ title: "Crawl4AI API"
+ version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
+ host: "0.0.0.0"
+ port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
+ reload: False # Default set to False - suitable for production
+ timeout_keep_alive: 300
+
+# Default LLM Configuration
+llm:
+ provider: "openai/gpt-4o-mini"
+ api_key_env: "OPENAI_API_KEY"
+ # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
+
+# Redis Configuration (Used by internal Redis server managed by supervisord)
+redis:
+ host: "localhost"
+ port: 6379
+ db: 0
+ password: ""
+ # ... other redis options ...
+
+# Rate Limiting Configuration
+rate_limiting:
+ enabled: True
+ default_limit: "1000/minute"
+ trusted_proxies: []
+ storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
+
+# Security Configuration
+security:
+ enabled: false # Master toggle for security features
+ jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
+ https_redirect: false # Force HTTPS (requires security.enabled=true)
+ trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
+ headers: # Security headers (applied if security.enabled=true)
+ x_content_type_options: "nosniff"
+ x_frame_options: "DENY"
+ content_security_policy: "default-src 'self'"
+ strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+ memory_threshold_percent: 95.0
+ rate_limiter:
+ base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
+ timeouts:
+ stream_init: 30.0 # Timeout for stream initialization
+ batch_process: 300.0 # Timeout for non-streaming /crawl processing
+
+# Logging Configuration
+logging:
+ level: "INFO"
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+ prometheus:
+ enabled: True
+ endpoint: "/metrics"
+ health_check:
+ endpoint: "/health"
+```
+
+*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
+
+*(Configuration Tips and Best Practices remain the same)*
+
+### Customizing Your Configuration
+
+You can override the default `config.yml`.
+
+#### Method 1: Modify Before Build
+
+1. Edit the `deploy/docker/config.yml` file in your local repository clone.
+2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
+
+#### Method 2: Runtime Mount (Recommended for Custom Deploys)
+
+1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
+2. Mount it when running the container:
+
+ * **Using `docker run`:**
+ ```bash
+ # Assumes my-custom-config.yml is in the current directory
+ docker run -d -p 11235:11235 \
+ --name crawl4ai-custom-config \
+ --env-file .llm.env \
+ --shm-size=1g \
+ -v $(pwd)/my-custom-config.yml:/app/config.yml \
+ unclecode/crawl4ai:latest # Or your specific tag
+ ```
+
+ * **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
+ ```yaml
+ services:
+ crawl4ai-hub-amd64: # Or your chosen service
+ image: unclecode/crawl4ai:latest
+ profiles: ["hub-amd64"]
+ <<: *base-config
+ volumes:
+ # Mount local custom config over the default one in the container
+ - ./my-custom-config.yml:/app/config.yml
+ # Keep the shared memory volume from base-config
+ - /dev/shm:/dev/shm
+ ```
+ *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
+
+> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
+
+### Configuration Recommendations
+
+1. **Security First** 🔒
+ - Always enable security in production
+ - Use specific trusted_hosts instead of wildcards
+ - Set up proper rate limiting to protect your server
+ - Consider your environment before enabling HTTPS redirect
+
+2. **Resource Management** 💻
+ - Adjust memory_threshold_percent based on available RAM
+ - Set timeouts according to your content size and network conditions
+ - Use Redis for rate limiting in multi-container setups
+
+3. **Monitoring** 📊
+ - Enable Prometheus if you need metrics
+ - Set DEBUG logging in development, INFO in production
+ - Regular health check monitoring is crucial
+
+4. **Performance Tuning** ⚡
+ - Start with conservative rate limiter delays
+ - Increase batch_process timeout for large content
+ - Adjust stream_init timeout based on initial response times
+
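+For example, applying the resource-management and rate-limiting recommendations above in a multi-container setup might look like this partial `config.yml` (illustrative starting points, not tuned defaults):
+
+```yaml
+rate_limiting:
+  enabled: True
+  default_limit: "500/minute"
+  storage_uri: "redis://localhost:6379"  # shared, persistent limits across containers
+
+crawler:
+  memory_threshold_percent: 85.0  # leave more headroom on smaller hosts
+  timeouts:
+    stream_init: 60.0      # slower initial responses
+    batch_process: 600.0   # larger batches or heavier pages
+```
+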
+## Getting Help
+
+We're here to help you succeed with Crawl4AI! Here's how to get support:
+
+- 📖 Check our [full documentation](https://docs.crawl4ai.com)
+- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
+- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
+- ⭐ Star us on GitHub to show support!
+
+## Summary
+
+In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
+- Building and running the Docker container
+- Configuring the environment
+- Using the interactive playground for testing
+- Making API requests with proper typing
+- Using the Python SDK
+- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
+- Connecting via the Model Context Protocol (MCP)
+- Monitoring your deployment
+
+The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
+
+For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+
+Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+
+Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
+
+Happy crawling! 🕷️
diff --git a/docs/md_v2/core/docker-deploymeny.md b/docs/md_v2/core/docker-deploymeny.md
deleted file mode 100644
index a3d0def1..00000000
--- a/docs/md_v2/core/docker-deploymeny.md
+++ /dev/null
@@ -1,702 +0,0 @@
-# Docker Deployment
-
-Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments.
-
-## Quick Start 🚀
-
-Pull and run the basic version:
-
-```bash
-# Basic run without security
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
-
-# Run with API security enabled
-docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
-```
-
-## Running with Docker Compose 🐳
-
-### Use Docker Compose (From Local Dockerfile or Docker Hub)
-
-Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
-
-### **Option 1: Using Docker Compose to Build Locally**
-If you want to build the image locally, use the provided `docker-compose.local.yml` file.
-
-```bash
-docker-compose -f docker-compose.local.yml up -d
-```
-
-This will:
-1. Build the Docker image from the provided `Dockerfile`.
-2. Start the container and expose it on `http://localhost:11235`.
-
----
-
-### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
-If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
-
-```bash
-docker-compose -f docker-compose.hub.yml up -d
-```
-
-This will:
-1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
-2. Start the container and expose it on `http://localhost:11235`.
-
----
-
-### **Stopping the Running Services**
-
-To stop the services started via Docker Compose, you can use:
-
-```bash
-docker-compose -f docker-compose.local.yml down
-# OR
-docker-compose -f docker-compose.hub.yml down
-```
-
-If the containers don’t stop and the application is still running, check the running containers:
-
-```bash
-docker ps
-```
-
-Find the `CONTAINER ID` of the running service and stop it forcefully:
-
-```bash
-docker stop <CONTAINER_ID>
-```
-
----
-
-### **Debugging with Docker Compose**
-
-- **Check Logs**: To view the container logs:
- ```bash
- docker-compose -f docker-compose.local.yml logs -f
- ```
-
-- **Remove Orphaned Containers**: If the service is still running unexpectedly:
- ```bash
- docker-compose -f docker-compose.local.yml down --remove-orphans
- ```
-
-- **Manually Remove Network**: If the network is still in use:
- ```bash
- docker network ls
- docker network rm crawl4ai_default
- ```
-
----
-
-### Why Use Docker Compose?
-
-Docker Compose is the recommended way to deploy Crawl4AI because:
-1. It simplifies multi-container setups.
-2. Allows you to define environment variables, resources, and ports in a single file.
-3. Makes it easier to switch between local development and production-ready images.
-
-For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
-
-
-
-
-## API Security 🔒
-
-### Understanding CRAWL4AI_API_TOKEN
-
-The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance:
-
-- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication
-- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible
-
-```bash
-# Secured Instance
-docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all
-
-# Unsecured Instance
-docker run -p 11235:11235 unclecode/crawl4ai:all
-```
-
-### Making API Calls
-
-For secured instances, include the token in all requests:
-
-```python
-import requests
-
-# Setup headers if token is being used
-api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN
-headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}
-
-# Making authenticated requests
-response = requests.post(
- "http://localhost:11235/crawl",
- headers=headers,
- json={
- "urls": "https://example.com",
- "priority": 10
- }
-)
-
-# Checking task status
-task_id = response.json()["task_id"]
-status = requests.get(
- f"http://localhost:11235/task/{task_id}",
- headers=headers
-)
-```
-
-### Using with Docker Compose
-
-In your `docker-compose.yml`:
-```yaml
-services:
- crawl4ai:
- image: unclecode/crawl4ai:all
- environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional
- # ... other configuration
-```
-
-Then either:
-1. Set in `.env` file:
-```env
-CRAWL4AI_API_TOKEN=your_secret_token
-```
-
-2. Or set via command line:
-```bash
-CRAWL4AI_API_TOKEN=your_secret_token docker-compose up
-```
-
-> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`).
-
-## Configuration Options 🔧
-
-### Environment Variables
-
-You can configure the service using environment variables:
-
-```bash
-# Basic configuration
-docker run -p 11235:11235 \
- -e MAX_CONCURRENT_TASKS=5 \
- unclecode/crawl4ai:all
-
-# With security and LLM support
-docker run -p 11235:11235 \
- -e CRAWL4AI_API_TOKEN=your_secret_token \
- -e OPENAI_API_KEY=sk-... \
- -e ANTHROPIC_API_KEY=sk-ant-... \
- unclecode/crawl4ai:all
-```
-
-### Using Docker Compose (Recommended) 🐳
-
-Create a `docker-compose.yml`:
-
-```yaml
-version: '3.8'
-
-services:
- crawl4ai:
- image: unclecode/crawl4ai:all
- ports:
- - "11235:11235"
- environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security
- - MAX_CONCURRENT_TASKS=5
- # LLM Provider Keys
- - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- volumes:
- - /dev/shm:/dev/shm
- deploy:
- resources:
- limits:
- memory: 4G
- reservations:
- memory: 1G
-```
-
-You can run it in two ways:
-
-1. Using environment variables directly:
-```bash
-CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up
-```
-
-2. Using a `.env` file (recommended):
-Create a `.env` file in the same directory:
-```env
-# API Security (optional)
-CRAWL4AI_API_TOKEN=your_secret_token
-
-# LLM Provider Keys
-OPENAI_API_KEY=sk-...
-ANTHROPIC_API_KEY=sk-ant-...
-
-# Other Configuration
-MAX_CONCURRENT_TASKS=5
-```
-
-Then simply run:
-```bash
-docker-compose up
-```
-
-### Testing the Deployment 🧪
-
-```python
-import requests
-
-# For unsecured instances
-def test_unsecured():
- # Health check
- health = requests.get("http://localhost:11235/health")
- print("Health check:", health.json())
-
- # Basic crawl
- response = requests.post(
- "http://localhost:11235/crawl",
- json={
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
- )
- task_id = response.json()["task_id"]
- print("Task ID:", task_id)
-
-# For secured instances
-def test_secured(api_token):
- headers = {"Authorization": f"Bearer {api_token}"}
-
- # Basic crawl with authentication
- response = requests.post(
- "http://localhost:11235/crawl",
- headers=headers,
- json={
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
- )
- task_id = response.json()["task_id"]
- print("Task ID:", task_id)
-```
-
-### LLM Extraction Example 🤖
-
-When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction:
-
-```python
-request = {
- "urls": "https://example.com",
- "extraction_config": {
- "type": "llm",
- "params": {
- "provider": "openai/gpt-4",
- "instruction": "Extract main topics from the page"
- }
- }
-}
-
-# Make the request (add headers if using API security)
-response = requests.post("http://localhost:11235/crawl", json=request)
-```
-
-> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure!
-
-
-## Usage Examples 📝
-
-### Basic Crawling
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
-}
-
-response = requests.post("http://localhost:11235/crawl", json=request)
-task_id = response.json()["task_id"]
-
-# Get results
-result = requests.get(f"http://localhost:11235/task/{task_id}")
-```
-
-### Structured Data Extraction
-
-```python
-schema = {
- "name": "Crypto Prices",
- "baseSelector": ".cds-tableRow-t45thuk",
- "fields": [
- {
- "name": "crypto",
- "selector": "td:nth-child(1) h2",
- "type": "text",
- },
- {
- "name": "price",
- "selector": "td:nth-child(2)",
- "type": "text",
- }
- ],
-}
-
-request = {
- "urls": "https://www.coinbase.com/explore",
- "extraction_config": {
- "type": "json_css",
- "params": {"schema": schema}
- }
-}
-```
-
-### Dynamic Content Handling
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "js_code": [
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ],
- "wait_for": "article.tease-card:nth-child(10)"
-}
-```
-
-### AI-Powered Extraction (Full Version)
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "extraction_config": {
- "type": "cosine",
- "params": {
- "semantic_filter": "business finance economy",
- "word_count_threshold": 10,
- "max_dist": 0.2,
- "top_k": 3
- }
- }
-}
-```
-
-## Platform-Specific Instructions 💻
-
-### macOS
-```bash
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
-```
-
-### Ubuntu
-```bash
-# Basic version
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
-
-# With GPU support
-docker pull unclecode/crawl4ai:gpu
-docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu
-```
-
-### Windows (PowerShell)
-```powershell
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
-```
-
-## Testing 🧪
-
-Save this as `test_docker.py`:
-
-```python
-import requests
-import json
-import time
-import sys
-
-class Crawl4AiTester:
- def __init__(self, base_url: str = "http://localhost:11235"):
- self.base_url = base_url
-
- def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict:
- # Submit crawl job
- response = requests.post(f"{self.base_url}/crawl", json=request_data)
- task_id = response.json()["task_id"]
- print(f"Task ID: {task_id}")
-
- # Poll for result
- start_time = time.time()
- while True:
- if time.time() - start_time > timeout:
- raise TimeoutError(f"Task {task_id} timeout")
-
- result = requests.get(f"{self.base_url}/task/{task_id}")
- status = result.json()
-
- if status["status"] == "completed":
- return status
-
- time.sleep(2)
-
-def test_deployment():
- tester = Crawl4AiTester()
-
- # Test basic crawl
- request = {
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
-
- result = tester.submit_and_wait(request)
- print("Basic crawl successful!")
- print(f"Content length: {len(result['result']['markdown'])}")
-
-if __name__ == "__main__":
- test_deployment()
-```
-
-## Advanced Configuration ⚙️
-
-### Crawler Parameters
-
-The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use:
-
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- # Browser Configuration
- "headless": True, # Run in headless mode
- "browser_type": "chromium", # chromium/firefox/webkit
- "user_agent": "custom-agent", # Custom user agent
- "proxy": "http://proxy:8080", # Proxy configuration
-
- # Performance & Behavior
- "page_timeout": 30000, # Page load timeout (ms)
- "verbose": True, # Enable detailed logging
- "semaphore_count": 5, # Concurrent request limit
-
- # Anti-Detection Features
- "simulate_user": True, # Simulate human behavior
- "magic": True, # Advanced anti-detection
- "override_navigator": True, # Override navigator properties
-
- # Session Management
- "user_data_dir": "./browser-data", # Browser profile location
- "use_managed_browser": True, # Use persistent browser
- }
-}
-```
-
-### Extra Parameters
-
-The `extra` field allows passing additional parameters directly to the crawler's `arun` function:
-
-```python
-request = {
- "urls": "https://example.com",
- "extra": {
- "word_count_threshold": 10, # Min words per block
- "only_text": True, # Extract only text
- "bypass_cache": True, # Force fresh crawl
- "process_iframes": True, # Include iframe content
- }
-}
-```
-
-### Complete Examples
-
-1. **Advanced News Crawling**
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "crawler_params": {
- "headless": True,
- "page_timeout": 30000,
- "remove_overlay_elements": True # Remove popups
- },
- "extra": {
- "word_count_threshold": 50, # Longer content blocks
- "bypass_cache": True # Fresh content
- },
- "css_selector": ".article-body"
-}
-```
-
-2. **Anti-Detection Configuration**
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- "simulate_user": True,
- "magic": True,
- "override_navigator": True,
- "user_agent": "Mozilla/5.0 ...",
- "headers": {
- "Accept-Language": "en-US,en;q=0.9"
- }
- }
-}
-```
-
-3. **LLM Extraction with Custom Parameters**
-```python
-request = {
- "urls": "https://openai.com/pricing",
- "extraction_config": {
- "type": "llm",
- "params": {
- "provider": "openai/gpt-4",
- "schema": pricing_schema
- }
- },
- "crawler_params": {
- "verbose": True,
- "page_timeout": 60000
- },
- "extra": {
- "word_count_threshold": 1,
- "only_text": True
- }
-}
-```
-
-4. **Session-Based Dynamic Content**
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- "session_id": "dynamic_session",
- "headless": False,
- "page_timeout": 60000
- },
- "js_code": ["window.scrollTo(0, document.body.scrollHeight);"],
- "wait_for": "js:() => document.querySelectorAll('.item').length > 10",
- "extra": {
- "delay_before_return_html": 2.0
- }
-}
-```
-
-5. **Screenshot with Custom Timing**
-```python
-request = {
- "urls": "https://example.com",
- "screenshot": True,
- "crawler_params": {
- "headless": True,
- "screenshot_wait_for": ".main-content"
- },
- "extra": {
- "delay_before_return_html": 3.0
- }
-}
-```
-
-### Parameter Reference Table
-
-| Category | Parameter | Type | Description |
-|----------|-----------|------|-------------|
-| Browser | headless | bool | Run browser in headless mode |
-| Browser | browser_type | str | Browser engine selection |
-| Browser | user_agent | str | Custom user agent string |
-| Network | proxy | str | Proxy server URL |
-| Network | headers | dict | Custom HTTP headers |
-| Timing | page_timeout | int | Page load timeout (ms) |
-| Timing | delay_before_return_html | float | Wait before capture |
-| Anti-Detection | simulate_user | bool | Human behavior simulation |
-| Anti-Detection | magic | bool | Advanced protection |
-| Session | session_id | str | Browser session ID |
-| Session | user_data_dir | str | Profile directory |
-| Content | word_count_threshold | int | Minimum words per block |
-| Content | only_text | bool | Text-only extraction |
-| Content | process_iframes | bool | Include iframe content |
-| Debug | verbose | bool | Detailed logging |
-| Debug | log_console | bool | Browser console logs |
-
-## Troubleshooting 🔍
-
-### Common Issues
-
-1. **Connection Refused**
- ```
- Error: Connection refused at localhost:11235
- ```
- Solution: Ensure the container is running and ports are properly mapped.
-
-2. **Resource Limits**
- ```
- Error: No available slots
- ```
- Solution: Increase MAX_CONCURRENT_TASKS or container resources.
-
-3. **GPU Access**
- ```
- Error: GPU not found
- ```
- Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag.
-
-### Debug Mode
-
-Access container for debugging:
-```bash
-docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all
-```
-
-View container logs:
-```bash
-docker logs [container_id]
-```
-
-## Best Practices 🌟
-
-1. **Resource Management**
- - Set appropriate memory and CPU limits
- - Monitor resource usage via health endpoint
- - Use basic version for simple crawling tasks
-
-2. **Scaling**
- - Use multiple containers for high load
- - Implement proper load balancing
- - Monitor performance metrics
-
-3. **Security**
- - Use environment variables for sensitive data
- - Implement proper network isolation
- - Regular security updates
-
-## API Reference 📚
-
-### Health Check
-```http
-GET /health
-```
-
-### Submit Crawl Task
-```http
-POST /crawl
-Content-Type: application/json
-
-{
- "urls": "string or array",
- "extraction_config": {
- "type": "basic|llm|cosine|json_css",
- "params": {}
- },
- "priority": 1-10,
- "ttl": 3600
-}
-```
-
-### Get Task Status
-```http
-GET /task/{task_id}
-```
-
-For more details, visit the [official documentation](https://docs.crawl4ai.com/).
\ No newline at end of file
diff --git a/docs/md_v2/core/examples.md b/docs/md_v2/core/examples.md
new file mode 100644
index 00000000..93989552
--- /dev/null
+++ b/docs/md_v2/core/examples.md
@@ -0,0 +1,115 @@
+# Code Examples
+
+This page provides a comprehensive list of example scripts that demonstrate various features and capabilities of Crawl4AI. Each example is designed to showcase specific functionality, making it easier for you to understand how to implement these features in your own projects.
+
+## Getting Started Examples
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Hello World | A simple introductory example demonstrating basic usage of AsyncWebCrawler with JavaScript execution and content filtering. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world.py) |
+| Quickstart | A comprehensive collection of examples showcasing various features including basic crawling, content cleaning, link analysis, JavaScript execution, CSS selectors, media handling, custom hooks, proxy configuration, screenshots, and multiple extraction strategies. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) |
+| Quickstart Set 1 | Basic examples for getting started with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_1.py) |
+| Quickstart Set 2 | More advanced examples for working with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_2.py) |
+
+## Browser & Crawling Features
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Built-in Browser | Demonstrates how to use the built-in browser capabilities. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/builtin_browser_example.py) |
+| Browser Optimization | Focuses on browser performance optimization techniques. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/browser_optimization_example.py) |
+| arun vs arun_many | Compares the `arun` and `arun_many` methods for single vs. multiple URL crawling. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/arun_vs_arun_many.py) |
+| Multiple URLs | Shows how to crawl multiple URLs asynchronously. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/async_webcrawler_multiple_urls_example.py) |
+| Page Interaction | Guide on interacting with dynamic elements through clicks. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/tutorial_dynamic_clicks.md) |
+| Crawler Monitor | Shows how to monitor the crawler's activities and status. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crawler_monitor_example.py) |
+| Full Page Screenshot & PDF | Guide on capturing full-page screenshots and PDFs from massive webpages. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/full_page_screenshot_and_pdf_export.md) |
+
+## Advanced Crawling & Deep Crawling
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
+| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
+| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
+| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
+
+## Extraction Strategies
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Extraction Strategies | Demonstrates different extraction strategies with various input formats (markdown, HTML, fit_markdown) and JSON-based extractors (CSS and XPath). | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/extraction_strategies_examples.py) |
+| Scraping Strategies | Compares the performance of different scraping strategies. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/scraping_strategies_performance.py) |
+| LLM Extraction | Demonstrates LLM-based extraction specifically for OpenAI pricing data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/llm_extraction_openai_pricing.py) |
+| LLM Markdown | Shows how to use LLMs to generate markdown from crawled content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/llm_markdown_generator.py) |
+| Summarize Page | Shows how to summarize web page content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/summarize_page.py) |
+
+## E-commerce & Specialized Crawling
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Amazon Product Extraction | Demonstrates how to extract structured product data from Amazon search results using CSS selectors. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_direct_url.py) |
+| Amazon with Hooks | Shows how to use hooks with Amazon product extraction. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_using_hooks.py) |
+| Amazon with JavaScript | Demonstrates using custom JavaScript for Amazon product extraction. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_using_use_javascript.py) |
+| Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) |
+| SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) |
+
+## Customization & Security
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Hooks | Illustrates how to use hooks at different stages of the crawling process for advanced customization. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hooks_example.py) |
+| Identity-Based Browsing | Illustrates identity-based browsing configurations for authentic browsing experiences. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/identity_based_browsing.py) |
+| Proxy Rotation | Shows how to use proxy rotation for web scraping and avoiding IP blocks. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/proxy_rotation_demo.py) |
+| SSL Certificate | Illustrates SSL certificate handling and verification. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/ssl_example.py) |
+| Language Support | Shows how to handle different languages during crawling. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/language_support_example.py) |
+| Geolocation | Demonstrates how to use geolocation features. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/use_geo_location.py) |
+
+## Docker & Deployment
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Docker Config | Demonstrates how to create and use Docker configuration objects. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_config_obj.py) |
+| Docker Basic | A test suite for Docker deployment, showcasing various functionalities through the Docker API. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py) |
+| Docker REST API | Shows how to interact with Crawl4AI Docker using REST API calls. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) |
+| Docker SDK | Demonstrates using the Python SDK for Crawl4AI Docker. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) |
+
+## Application Examples
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Research Assistant | Demonstrates how to build a research assistant using Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/research_assistant.py) |
+| REST Call | Shows how to make REST API calls with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/rest_call.py) |
+| Chainlit Integration | Shows how to integrate Crawl4AI with Chainlit. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/chainlit.md) |
+| Crawl4AI vs FireCrawl | Compares Crawl4AI with the FireCrawl library. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crawlai_vs_firecrawl.py) |
+
+## Content Generation & Markdown
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Content Source | Demonstrates how to work with different content sources in markdown generation. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/markdown/content_source_example.py) |
+| Content Source (Short) | A simplified version of content source usage. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/markdown/content_source_short_example.py) |
+| Built-in Browser Guide | Guide for using the built-in browser capabilities. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/README_BUILTIN_BROWSER.md) |
+
+## Running the Examples
+
+To run any of these examples, you'll need to have Crawl4AI installed:
+
+```bash
+pip install crawl4ai
+```
+
+Then, you can run an example script like this:
+
+```bash
+python -m docs.examples.hello_world
+```
+
+For examples that require additional dependencies or environment variables, refer to the comments at the top of each file.
+
+Some examples may require:
+- API keys (for LLM-based examples)
+- Docker setup (for Docker-related examples)
+- Additional dependencies (specified in the example files)
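+
+For the LLM-based examples, you typically export the relevant provider key before running the script (shown here for OpenAI; other providers use their own environment variables):
+
+```bash
+export OPENAI_API_KEY="sk-..."
+python -m docs.examples.quickstart
+```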
+
+## Contributing New Examples
+
+If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
\ No newline at end of file
diff --git a/docs/md_v2/core/fit-markdown.md b/docs/md_v2/core/fit-markdown.md
index 6c894330..3c6d3e02 100644
--- a/docs/md_v2/core/fit-markdown.md
+++ b/docs/md_v2/core/fit-markdown.md
@@ -10,11 +10,10 @@
In **`CrawlerRunConfig`**, you can specify a **`content_filter`** to shape how content is pruned or ranked before final markdown generation. A filter’s logic is applied **before** or **during** the HTML→Markdown process, producing:
-- **`result.markdown_v2.raw_markdown`** (unfiltered)
-- **`result.markdown_v2.fit_markdown`** (filtered or “fit” version)
-- **`result.markdown_v2.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`)
+- **`result.markdown.raw_markdown`** (unfiltered)
+- **`result.markdown.fit_markdown`** (filtered or “fit” version)
+- **`result.markdown.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`)
-> **Note**: We’re currently storing the result in `markdown_v2`, but eventually we’ll unify it as `result.markdown`.
### 1.2 Common Filters
@@ -62,8 +61,8 @@ async def main():
if result.success:
# 'fit_markdown' is your pruned content, focusing on "denser" text
- print("Raw Markdown length:", len(result.markdown_v2.raw_markdown))
- print("Fit Markdown length:", len(result.markdown_v2.fit_markdown))
+ print("Raw Markdown length:", len(result.markdown.raw_markdown))
+ print("Fit Markdown length:", len(result.markdown.fit_markdown))
else:
print("Error:", result.error_message)
@@ -123,7 +122,7 @@ async def main():
)
if result.success:
print("Fit Markdown (BM25 query-based):")
- print(result.markdown_v2.fit_markdown)
+ print(result.markdown.fit_markdown)
else:
print("Error:", result.error_message)
@@ -144,11 +143,11 @@ if __name__ == "__main__":
## 4. Accessing the “Fit” Output
-After the crawl, your “fit” content is found in **`result.markdown_v2.fit_markdown`**. In future versions, it will be **`result.markdown.fit_markdown`**. Meanwhile:
+After the crawl, your “fit” content is found in **`result.markdown.fit_markdown`**.
```python
-fit_md = result.markdown_v2.fit_markdown
-fit_html = result.markdown_v2.fit_html
+fit_md = result.markdown.fit_markdown
+fit_html = result.markdown.fit_html
```
If the content filter is **BM25**, you might see additional logic or references in `fit_markdown` that highlight relevant segments. If it’s **Pruning**, the text is typically well-cleaned but not necessarily matched to a query.
@@ -167,7 +166,6 @@ prune_filter = PruningContentFilter(
)
md_generator = DefaultMarkdownGenerator(content_filter=prune_filter)
config = CrawlerRunConfig(markdown_generator=md_generator)
-# => result.markdown_v2.fit_markdown
```
### 5.2 BM25
@@ -179,7 +177,6 @@ bm25_filter = BM25ContentFilter(
)
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
config = CrawlerRunConfig(markdown_generator=md_generator)
-# => result.markdown_v2.fit_markdown
```
---
@@ -203,7 +200,7 @@ Thus, **multi-level** filtering occurs:
1. The crawler’s `excluded_tags` are removed from the HTML first.
2. The content filter (Pruning, BM25, or custom) prunes or ranks the remaining text blocks.
-3. The final “fit” content is generated in `result.markdown_v2.fit_markdown`.
+3. The final “fit” content is generated in `result.markdown.fit_markdown`.
---
@@ -241,7 +238,7 @@ class MyCustomFilter(RelevantContentFilter):
- **PruningContentFilter**: Great if you just want the “meatiest” text without a user query.
- **BM25ContentFilter**: Perfect for query-based extraction or searching.
- Combine with **`excluded_tags`, `exclude_external_links`, `word_count_threshold`** to refine your final “fit” text.
-- Fit markdown ends up in **`result.markdown_v2.fit_markdown`**; eventually **`result.markdown.fit_markdown`** in future versions.
+- Fit markdown ends up in **`result.markdown.fit_markdown`**.
With these tools, you can **zero in** on the text that truly matters, ignoring spammy or boilerplate content, and produce a concise, relevant “fit markdown” for your AI or data pipelines. Happy pruning and searching!
diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md
index ed56e8fb..58bedcbc 100644
--- a/docs/md_v2/core/link-media.md
+++ b/docs/md_v2/core/link-media.md
@@ -4,7 +4,35 @@ In this tutorial, you’ll learn how to:
1. Extract links (internal, external) from crawled pages
2. Filter or exclude specific domains (e.g., social media or custom domains)
-3. Access and manage media data (especially images) in the crawl result
+3. Access and manage media data (especially images) in the crawl result
+
+### 3.2 Excluding Images
+
+#### Excluding External Images
+
+If you're dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+ exclude_external_images=True
+)
+```
+
+This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
+
+#### Excluding All Images
+
+If you want to completely remove all images from the page to maximize performance and reduce memory usage, use:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+ exclude_all_images=True
+)
+```
+
+This setting removes all images very early in the processing pipeline, which significantly improves memory efficiency and processing speed. This is particularly useful when:
+- You don't need image data in your results
+- You're crawling image-heavy pages that cause memory issues
+- You want to focus only on text content
+- You need to maximize crawling speed
+
4. Configure your crawler to exclude or prioritize certain images
> **Prerequisites**
@@ -133,19 +161,28 @@ This approach is handy when you still want external links but need to block cert
### 3.1 Accessing `result.media`
-By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`).
+By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
**Basic Example**:
```python
if result.success:
+ # Get images
images_info = result.media.get("images", [])
print(f"Found {len(images_info)} images in total.")
- for i, img in enumerate(images_info[:5]): # Inspect just the first 5
+ for i, img in enumerate(images_info[:3]): # Inspect just the first 3
print(f"[Image {i}] URL: {img['src']}")
print(f" Alt text: {img.get('alt', '')}")
print(f" Score: {img.get('score')}")
print(f" Description: {img.get('desc', '')}\n")
+
+ # Get tables
+ tables = result.media.get("tables", [])
+ print(f"Found {len(tables)} data tables in total.")
+ for i, table in enumerate(tables):
+ print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
+ print(f" Columns: {len(table.get('headers', []))}")
+ print(f" Rows: {len(table.get('rows', []))}")
```
**Structure Example**:
@@ -171,6 +208,19 @@ result.media = {
],
"audio": [
# Similar structure but with audio-specific fields
+ ],
+ "tables": [
+ {
+ "headers": ["Name", "Age", "Location"],
+ "rows": [
+ ["John Doe", "34", "New York"],
+ ["Jane Smith", "28", "San Francisco"],
+ ["Alex Johnson", "42", "Chicago"]
+ ],
+ "caption": "Employee Directory",
+ "summary": "Directory of company employees"
+ },
+ # More tables if present
]
}
```
@@ -199,12 +249,91 @@ crawler_cfg = CrawlerRunConfig(
This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling.
-### 3.3 Additional Media Config
+### 3.3 Working with Tables
+
+Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
+
+- Presence of thead and tbody sections
+- Use of th elements for headers
+- Column consistency
+- Text density
+- And other factors
+
+Tables that score above the threshold (default: 7) are extracted and stored in `result.media.tables`.
+
+**Accessing Table Data**:
+
+```python
+if result.success:
+ tables = result.media.get("tables", [])
+ print(f"Found {len(tables)} data tables on the page")
+
+ if tables:
+ # Access the first table
+ first_table = tables[0]
+ print(f"Table caption: {first_table.get('caption', 'No caption')}")
+ print(f"Headers: {first_table.get('headers', [])}")
+
+ # Print the first 3 rows
+ for i, row in enumerate(first_table.get('rows', [])[:3]):
+ print(f"Row {i+1}: {row}")
+```
+
+**Configuring Table Extraction**:
+
+You can adjust the sensitivity of the table detection algorithm with:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+ table_score_threshold=5 # Lower value = more tables detected (default: 7)
+)
+```
+
+Each extracted table contains:
+- `headers`: Column header names
+- `rows`: List of rows, each containing cell values
+- `caption`: Table caption text (if available)
+- `summary`: Table summary attribute (if specified)
+
+### 3.4 Additional Media Config
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
+- **`capture_mhtml`**: Set to `True` if you want an MHTML snapshot of the page in `result.mhtml`. This format preserves the entire web page with all its resources (CSS, images, scripts) in a single file, making it perfect for archiving or offline viewing.
- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction.
+#### Example: Capturing Page as MHTML
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ crawler_cfg = CrawlerRunConfig(
+ capture_mhtml=True # Enable MHTML capture
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com", config=crawler_cfg)
+
+ if result.success and result.mhtml:
+ # Save the MHTML snapshot to a file
+ with open("example.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
+ print("MHTML snapshot saved to example.mhtml")
+ else:
+ print("Failed to capture MHTML:", result.error_message)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+The MHTML format is particularly useful because:
+- It captures the complete page state including all resources
+- It can be opened in most modern browsers for offline viewing
+- It preserves the page exactly as it appeared during crawling
+- It's a single file, making it easy to store and transfer
+
---
## 4. Putting It All Together: Link & Media Filtering
@@ -273,4 +402,11 @@ if __name__ == "__main__":
---
-**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
\ No newline at end of file
+**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
+
+### Table Extraction Tips
+
+- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
+- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
+- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
+
+The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.
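+
+If you want to analyze an extracted table further, a small sketch like the following turns it into a pandas DataFrame (assuming pandas is installed; the `headers`/`rows` fields follow the structure shown earlier):
+
+```python
+import pandas as pd
+
+tables = result.media.get("tables", [])
+if tables:
+    table = tables[0]
+    # Build a DataFrame from the extracted rows, using headers as column names when present
+    df = pd.DataFrame(table.get("rows", []), columns=table.get("headers", []) or None)
+    print(df.head())
+```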
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md
index ab8f9b05..e6f5e12a 100644
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -111,13 +111,71 @@ Some commonly used `options`:
- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page.
 - **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way.
+## 4. Selecting the HTML Source for Markdown Generation
+
+The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown.
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ # Option 1: Use the raw HTML directly from the webpage (before any processing)
+ raw_md_generator = DefaultMarkdownGenerator(
+ content_source="raw_html",
+ options={"ignore_links": True}
+ )
+
+ # Option 2: Use the cleaned HTML (after scraping strategy processing - default)
+ cleaned_md_generator = DefaultMarkdownGenerator(
+ content_source="cleaned_html", # This is the default
+ options={"ignore_links": True}
+ )
+
+ # Option 3: Use preprocessed HTML optimized for schema extraction
+ fit_md_generator = DefaultMarkdownGenerator(
+ content_source="fit_html",
+ options={"ignore_links": True}
+ )
+
+ # Use one of the generators in your crawler config
+ config = CrawlerRunConfig(
+ markdown_generator=raw_md_generator # Try each of the generators
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com", config=config)
+ if result.success:
+ print("Markdown:\n", result.markdown.raw_markdown[:500])
+ else:
+ print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
+```
+
+### HTML Source Options
+
+- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed.
+
+- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content.
+
+- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed.
+
+### When to Use Each Option
+
+- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal.
+- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep.
+- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction.
+
---
-## 4. Content Filters
+## 5. Content Filters
**Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want.
-### 4.1 BM25ContentFilter
+### 5.1 BM25ContentFilter
If you have a **search query**, BM25 is a good choice:
@@ -146,7 +204,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
-### 4.2 PruningContentFilter
+### 5.2 PruningContentFilter
If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections.
@@ -170,19 +228,18 @@ prune_filter = PruningContentFilter(
- You want a broad cleanup without a user query.
- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
-### 4.3 LLMContentFilter
+### 5.3 LLMContentFilter
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
# Initialize LLM filter with specific instruction
filter = LLMContentFilter(
- provider="openai/gpt-4o", # or your preferred provider
- api_token="your-api-token", # or use environment variable
+ llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
instruction="""
Focus on extracting the core educational content.
Include:
@@ -205,7 +262,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com", config=config)
- print(result.fit_markdown) # Filtered markdown content
+ print(result.markdown.fit_markdown) # Filtered markdown content
```
**Key Features:**
@@ -248,16 +305,13 @@ filter = LLMContentFilter(
---
-## 5. Using Fit Markdown
+## 6. Using Fit Markdown
-When a content filter is active, the library produces two forms of markdown inside `result.markdown_v2` or (if using the simplified field) `result.markdown`:
+When a content filter is active, the library produces two forms of markdown inside `result.markdown`:
1. **`raw_markdown`**: The full unfiltered markdown.
2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments.
-**Note**:
-> In earlier examples, you may see references to `result.markdown_v2`. Depending on your library version, you might access `result.markdown`, `result.markdown_v2`, or an object named `MarkdownGenerationResult`. The idea is the same: you’ll have a raw version and a filtered (“fit”) version if a filter is used.
-
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
@@ -277,7 +331,7 @@ async def main():
print("Raw markdown:\n", result.markdown)
# If a filter is used, we also have .fit_markdown:
- md_object = result.markdown_v2 # or your equivalent
+ md_object = result.markdown # or your equivalent
print("Filtered markdown:\n", md_object.fit_markdown)
else:
print("Crawl failed:", result.error_message)
@@ -288,7 +342,7 @@ if __name__ == "__main__":
---
-## 6. The `MarkdownGenerationResult` Object
+## 7. The `MarkdownGenerationResult` Object
If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as:
@@ -301,7 +355,7 @@ If your library stores detailed markdown output in an object like `MarkdownGener
**Example**:
```python
-md_obj = result.markdown_v2 # your library’s naming may vary
+md_obj = result.markdown # your library’s naming may vary
print("RAW:\n", md_obj.raw_markdown)
print("CITED:\n", md_obj.markdown_with_citations)
print("REFERENCES:\n", md_obj.references_markdown)
@@ -319,7 +373,7 @@ Below is a **revised section** under “Combining Filters (BM25 + Pruning)” th
---
-## 7. Combining Filters (BM25 + Pruning) in Two Passes
+## 8. Combining Filters (BM25 + Pruning) in Two Passes
You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead:
@@ -411,7 +465,7 @@ If your codebase or pipeline design allows applying multiple filters in one pass
---
-## 8. Common Pitfalls & Tips
+## 9. Common Pitfalls & Tips
1. **No Markdown Output?**
- Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements.
@@ -431,11 +485,12 @@ If your codebase or pipeline design allows applying multiple filters in one pass
---
-## 9. Summary & Next Steps
+## 10. Summary & Next Steps
In this **Markdown Generation Basics** tutorial, you learned to:
- Configure the **DefaultMarkdownGenerator** with HTML-to-text options.
+- Select different HTML sources using the `content_source` parameter.
- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal.
- Distinguish between raw and filtered markdown (`fit_markdown`).
- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.).
diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md
index 04614533..de0b7e5e 100644
--- a/docs/md_v2/core/quickstart.md
+++ b/docs/md_v2/core/quickstart.md
@@ -128,6 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai import LLMConfig
# Generate a schema (one-time cost)
 html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
@@ -135,15 +136,13 @@ html = "Gaming Laptop $999.99
# Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema(
html,
- llm_provider="openai/gpt-4o", # Default provider
- api_token="your-openai-token" # Required for OpenAI
+ llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
)
# Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema(
html,
- llm_provider="ollama/llama3.3", # Open source alternative
- api_token=None # Not needed for Ollama
+ llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
)
# Use the schema for fast, repeated extractions
@@ -212,7 +211,7 @@ import os
import json
import asyncio
from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class OpenAIModelFee(BaseModel):
@@ -242,8 +241,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
- provider=provider,
- api_token=api_token,
+ llm_config = LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -259,12 +257,6 @@ async def extract_structured_data_using_llm(
print(result.extracted_content)
if __name__ == "__main__":
- # Use ollama with llama3.3
- # asyncio.run(
- # extract_structured_data_using_llm(
- # provider="ollama/llama3.3", api_token="no-token"
- # )
- # )
asyncio.run(
extract_structured_data_using_llm(
@@ -304,7 +296,7 @@ async def quick_parallel_example():
# Stream results as they complete
async for result in await crawler.arun_many(urls, config=run_conf):
if result.success:
- print(f"[OK] {result.url}, length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"[OK] {result.url}, length: {len(result.markdown.raw_markdown)}")
else:
print(f"[ERROR] {result.url} => {result.error_message}")
@@ -313,7 +305,7 @@ async def quick_parallel_example():
results = await crawler.arun_many(urls, config=run_conf)
for res in results:
if res.success:
- print(f"[OK] {res.url}, length: {len(res.markdown_v2.raw_markdown)}")
+ print(f"[OK] {res.url}, length: {len(res.markdown.raw_markdown)}")
else:
print(f"[ERROR] {res.url} => {res.error_message}")
diff --git a/docs/md_v2/core/simple-crawling.md b/docs/md_v2/core/simple-crawling.md
index ec63984c..094b5cc7 100644
--- a/docs/md_v2/core/simple-crawling.md
+++ b/docs/md_v2/core/simple-crawling.md
@@ -39,8 +39,8 @@ result = await crawler.arun(
# Different content formats
print(result.html) # Raw HTML
print(result.cleaned_html) # Cleaned HTML
-print(result.markdown) # Markdown version
-print(result.fit_markdown) # Most relevant content in markdown
+print(result.markdown.raw_markdown) # Raw markdown from cleaned html
+print(result.markdown.fit_markdown) # Most relevant content in markdown
# Check success status
print(result.success) # True if crawl succeeded
diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md
index dc2dba1a..9f6a6b3e 100644
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -2,7 +2,7 @@
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
-1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).
+1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from
---
-## 2. Provider-Agnostic via LightLLM
+## 2. Provider-Agnostic via LiteLLM
-Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
+You can use `LLMConfig` to quickly configure multiple LLM variations and experiment with them to find the optimal one for your use case. You can read more about `LLMConfig` [here](/api/parameters).
+
+```python
+llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
 - **`provider`**: The `<provider>/<model>` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
-- **`api_base`** (optional): If your provider has a custom endpoint.
+- **`base_url`** (optional): If your provider has a custom endpoint.
This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
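+
+For example, switching the same extraction between a hosted model and a local Ollama model is just a matter of swapping the config (a sketch; the `base_url` shown is the common local Ollama default, adjust it to your setup):
+
+```python
+import os
+from crawl4ai import LLMConfig
+
+# Hosted provider (token read from the environment)
+openai_cfg = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+
+# Local Ollama model - no token required
+ollama_cfg = LLMConfig(provider="ollama/llama3.3", api_token=None, base_url="http://localhost:11434")
+```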
@@ -52,27 +58,25 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
-1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
-2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.
-3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
-4. **`extraction_type`** (str): `"schema"` or `"block"`.
-5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
-6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
-7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
-8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
-9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
+1. **`llm_config`** (`LLMConfig`): The LLM configuration to use, e.g., `LLMConfig(provider="openai/gpt-4", ...)` or `LLMConfig(provider="ollama/llama2")`.
+2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
+3. **`extraction_type`** (str): `"schema"` or `"block"`.
+4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
+5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
+6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
+7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
+8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
- `"markdown"`: The raw markdown (default).
- `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.
- `"html"`: The cleaned or raw HTML.
-10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
-11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
+9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
+10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
**Example**:
```python
extraction_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4",
- api_token="YOUR_OPENAI_KEY",
+ llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
schema=MyModel.model_json_schema(),
extraction_type="schema",
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -97,7 +101,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class Product(BaseModel):
@@ -107,9 +111,8 @@ class Product(BaseModel):
async def main():
# 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4o-mini", # e.g. "ollama/llama2"
- api_token=os.getenv('OPENAI_API_KEY'),
- schema=Product.schema_json(), # Or use model_json_schema()
+ llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
+ schema=Product.schema_json(), # Or use model_json_schema()
extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.",
chunk_token_threshold=1000,
@@ -235,8 +238,7 @@ class KnowledgeGraph(BaseModel):
async def main():
# LLM extraction strategy
llm_strat = LLMExtractionStrategy(
- provider="openai/gpt-4",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llm_config = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
schema=KnowledgeGraph.schema_json(),
extraction_type="schema",
instruction="Extract entities and relationships from the content. Return valid JSON.",
@@ -288,7 +290,7 @@ if __name__ == "__main__":
## 11. Conclusion
-**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
+**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
- Put your LLM strategy **in `CrawlerRunConfig`**.
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
@@ -319,4 +321,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS
---
-That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
\ No newline at end of file
+That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md
index 97002dad..23fa7ad2 100644
--- a/docs/md_v2/extraction/no-llm-strategies.md
+++ b/docs/md_v2/extraction/no-llm-strategies.md
@@ -1,15 +1,20 @@
# Extracting JSON (No LLM)
-One of Crawl4AI’s **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
+One of Crawl4AI's **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. Crawl4AI offers several strategies for LLM-free extraction:
+
+1. **Schema-based extraction** with CSS or XPath selectors via `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`
+2. **Regular expression extraction** with `RegexExtractionStrategy` for fast pattern matching
+
+These approaches let you extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
**Why avoid LLM for basic extractions?**
-1. **Faster & Cheaper**: No API calls or GPU overhead.
-2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free.
-3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate.
-4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel.
+1. **Faster & Cheaper**: No API calls or GPU overhead.
+2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. Pattern-based extraction is practically carbon-free.
+3. **Precise & Repeatable**: CSS/XPath selectors and regex patterns do exactly what you specify. LLM outputs can vary or hallucinate.
+4. **Scales Readily**: For thousands of pages, pattern-based extraction runs quickly and in parallel.
-Below, we’ll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We’ll also highlight advanced features like **nested fields** and **base element attributes**.
+Below, we'll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We'll also highlight advanced features like **nested fields** and **base element attributes**.
---
@@ -17,17 +22,17 @@ Below, we’ll explore how to craft these schemas and use them with **JsonCssExt
A schema defines:
-1. A **base selector** that identifies each “container” element on the page (e.g., a product row, a blog post card).
-2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).
-3. **Nested** or **list** types for repeated or hierarchical structures.
+1. A **base selector** that identifies each "container" element on the page (e.g., a product row, a blog post card).
+2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).
+3. **Nested** or **list** types for repeated or hierarchical structures.
-For example, if you have a list of products, each one might have a name, price, reviews, and “related products.” This approach is faster and more reliable than an LLM for consistent, structured pages.
+For example, if you have a list of products, each one might have a name, price, reviews, and "related products." This approach is faster and more reliable than an LLM for consistent, structured pages.
---
## 2. Simple Example: Crypto Prices
-Let’s begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don’t** call any LLM:
+Let's begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don't** call any LLM:
```python
import json
@@ -87,7 +92,7 @@ asyncio.run(extract_crypto_prices())
**Highlights**:
-- **`baseSelector`**: Tells us where each “item” (crypto row) is.
+- **`baseSelector`**: Tells us where each "item" (crypto row) is.
- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.
- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
@@ -97,7 +102,7 @@ No LLM is needed, and the performance is **near-instant** for hundreds or thousa
### **XPath Example with `raw://` HTML**
-Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We’ll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
+Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We'll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
```python
import json
@@ -168,12 +173,12 @@ asyncio.run(extract_crypto_prices_xpath())
**Key Points**:
-1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.
-2. **`baseSelector`** and each field’s `"selector"` use **XPath** instead of CSS.
-3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.
+1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.
+2. **`baseSelector`** and each field's `"selector"` use **XPath** instead of CSS.
+3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.
4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**.
-That’s how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
+That's how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
---
@@ -187,7 +192,7 @@ We have a **sample e-commerce** HTML file on GitHub (example):
```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
```
-This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**.
+This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
```python
schema = {
@@ -333,24 +338,253 @@ async def extract_ecommerce_data():
asyncio.run(extract_ecommerce_data())
```
-If all goes well, you get a **structured** JSON array with each “category,” containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
+If all goes well, you get a **structured** JSON array with each "category," containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
---
-## 4. Why “No LLM” Is Often Better
+## 4. RegexExtractionStrategy - Fast Pattern-Based Extraction
-1. **Zero Hallucination**: Schema-based extraction doesn’t guess text. It either finds it or not.
-2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
-3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling.
-4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model.
+Crawl4AI now offers a powerful new zero-LLM extraction strategy: `RegexExtractionStrategy`. This strategy provides lightning-fast extraction of common data types like emails, phone numbers, URLs, dates, and more using pre-compiled regular expressions.
-**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns.
+### Key Features
+
+- **Zero LLM Dependency**: Extracts data without any AI model calls
+- **Blazing Fast**: Uses pre-compiled regex patterns for maximum performance
+- **Built-in Patterns**: Includes ready-to-use patterns for common data types
+- **Custom Patterns**: Add your own regex patterns for domain-specific extraction
+- **LLM-Assisted Pattern Generation**: Optionally use an LLM once to generate optimized patterns, then reuse them without further LLM calls
+
+### Simple Example: Extracting Common Entities
+
+The easiest way to start is by using the built-in pattern catalog:
+
+```python
+import json
+import asyncio
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ RegexExtractionStrategy
+)
+
+async def extract_with_regex():
+ # Create a strategy using built-in patterns for URLs and currencies
+ strategy = RegexExtractionStrategy(
+ pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
+ )
+
+ config = CrawlerRunConfig(extraction_strategy=strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=config
+ )
+
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for item in data[:5]: # Show first 5 matches
+ print(f"{item['label']}: {item['value']}")
+ print(f"Total matches: {len(data)}")
+
+asyncio.run(extract_with_regex())
+```
+
+### Available Built-in Patterns
+
+`RegexExtractionStrategy` provides these common patterns as IntFlag attributes for easy combining:
+
+```python
+# Use individual patterns
+strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
+
+# Combine multiple patterns
+strategy = RegexExtractionStrategy(
+ pattern = (
+ RegexExtractionStrategy.Email |
+ RegexExtractionStrategy.PhoneUS |
+ RegexExtractionStrategy.Url
+ )
+)
+
+# Use all available patterns
+strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.All)
+```
+
+Available patterns include:
+- `Email` - Email addresses
+- `PhoneIntl` - International phone numbers
+- `PhoneUS` - US-format phone numbers
+- `Url` - HTTP/HTTPS URLs
+- `IPv4` - IPv4 addresses
+- `IPv6` - IPv6 addresses
+- `Uuid` - UUIDs
+- `Currency` - Currency values (USD, EUR, etc.)
+- `Percentage` - Percentage values
+- `Number` - Numeric values
+- `DateIso` - ISO format dates
+- `DateUS` - US format dates
+- `Time24h` - 24-hour format times
+- `PostalUS` - US postal codes
+- `PostalUK` - UK postal codes
+- `HexColor` - HTML hex color codes
+- `TwitterHandle` - Twitter handles
+- `Hashtag` - Hashtags
+- `MacAddr` - MAC addresses
+- `Iban` - International bank account numbers
+- `CreditCard` - Credit card numbers
+
+### Custom Pattern Example
+
+For more targeted extraction, you can provide custom patterns:
+
+```python
+import json
+import asyncio
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ RegexExtractionStrategy
+)
+
+async def extract_prices():
+ # Define a custom pattern for US Dollar prices
+ price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
+
+ # Create strategy with custom pattern
+ strategy = RegexExtractionStrategy(custom=price_pattern)
+ config = CrawlerRunConfig(extraction_strategy=strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com/products",
+ config=config
+ )
+
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for item in data:
+ print(f"Found price: {item['value']}")
+
+asyncio.run(extract_prices())
+```
+
+### LLM-Assisted Pattern Generation
+
+For complex or site-specific patterns, you can use an LLM once to generate an optimized pattern, then save and reuse it without further LLM calls:
+
+```python
+import json
+import asyncio
+from pathlib import Path
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ RegexExtractionStrategy,
+ LLMConfig
+)
+
+async def extract_with_generated_pattern():
+ cache_dir = Path("./pattern_cache")
+ cache_dir.mkdir(exist_ok=True)
+ pattern_file = cache_dir / "price_pattern.json"
+
+ # 1. Generate or load pattern
+ if pattern_file.exists():
+ pattern = json.load(pattern_file.open())
+ print(f"Using cached pattern: {pattern}")
+ else:
+ print("Generating pattern via LLM...")
+
+ # Configure LLM
+ llm_config = LLMConfig(
+ provider="openai/gpt-4o-mini",
+ api_token="env:OPENAI_API_KEY",
+ )
+
+ # Get sample HTML for context
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com/products")
+ html = result.fit_html
+
+ # Generate pattern (one-time LLM usage)
+ pattern = RegexExtractionStrategy.generate_pattern(
+ label="price",
+ html=html,
+ query="Product prices in USD format",
+ llm_config=llm_config,
+ )
+
+ # Cache pattern for future use
+ json.dump(pattern, pattern_file.open("w"), indent=2)
+
+ # 2. Use pattern for extraction (no LLM calls)
+ strategy = RegexExtractionStrategy(custom=pattern)
+ config = CrawlerRunConfig(extraction_strategy=strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com/products",
+ config=config
+ )
+
+ if result.success:
+ data = json.loads(result.extracted_content)
+ for item in data[:10]:
+ print(f"Extracted: {item['value']}")
+ print(f"Total matches: {len(data)}")
+
+asyncio.run(extract_with_generated_pattern())
+```
+
+This pattern allows you to:
+1. Use an LLM once to generate a highly optimized regex for your specific site
+2. Save the pattern to disk for reuse
+3. Extract data using only regex (no further LLM calls) in production
+
+### Extraction Results Format
+
+The `RegexExtractionStrategy` returns results in a consistent format:
+
+```json
+[
+ {
+ "url": "https://example.com",
+ "label": "email",
+ "value": "contact@example.com",
+ "span": [145, 163]
+ },
+ {
+ "url": "https://example.com",
+ "label": "url",
+ "value": "https://support.example.com",
+ "span": [210, 235]
+ }
+]
+```
+
+Each match includes:
+- `url`: The source URL
+- `label`: The pattern name that matched (e.g., "email", "phone_us")
+- `value`: The extracted text
+- `span`: The start and end positions in the source content
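+
+As a small post-processing sketch (assuming `result` comes from one of the regex crawls above), you might group the matches by label:
+
+```python
+import json
+from collections import defaultdict
+
+data = json.loads(result.extracted_content)
+
+# Bucket extracted values by the pattern that produced them
+by_label = defaultdict(list)
+for item in data:
+    by_label[item["label"]].append(item["value"])
+
+print({label: len(values) for label, values in by_label.items()})
+```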
---
-## 5. Base Element Attributes & Additional Fields
+## 5. Why "No LLM" Is Often Better
-It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
+1. **Zero Hallucination**: Pattern-based extraction doesn't guess text. It either finds it or not.
+2. **Guaranteed Structure**: The same schema or regex yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
+3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling.
+4. **Scalable**: Adding or updating a field is a matter of adjusting the schema or regex, not re-tuning a model.
+
+**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema or regex approach first for repeated or consistent data patterns.
+
+---
+
+## 6. Base Element Attributes & Additional Fields
+
+It's easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
```json
{
@@ -361,11 +595,11 @@ It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from y
}
```
-You can define them in **`baseFields`** (extracted from the main container element) or in each field’s sub-lists. This is especially helpful if you need an item’s link or ID stored in the parent `<div>`.
+You can define them in **`baseFields`** (extracted from the main container element) or in each field's sub-lists. This is especially helpful if you need an item's link or ID stored in the parent `<div>`.
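+
+As a rough illustration (the selectors and field names below are made up for this sketch), a schema using `baseFields` might look like:
+
+```python
+schema = {
+    "name": "Product Cards",
+    "baseSelector": "a.product-card",  # hypothetical container element
+    "baseFields": [
+        # Taken from the container element itself, e.g. its link target and data-id
+        {"name": "product_url", "type": "attribute", "attribute": "href"},
+        {"name": "product_id", "type": "attribute", "attribute": "data-id"},
+    ],
+    "fields": [
+        {"name": "title", "selector": "h3.title", "type": "text"},
+    ],
+}
+```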
---
-## 6. Putting It All Together: Larger Example
+## 7. Putting It All Together: Larger Example
Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author:
@@ -389,19 +623,20 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
---
-## 7. Tips & Best Practices
+## 8. Tips & Best Practices
-1. **Inspect the DOM** in Chrome DevTools or Firefox’s Inspector to find stable selectors.
-2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
-3. **Test** your schema on partial HTML or a test page before a big crawl.
-4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
-5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it’ll often show warnings.
-6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the “parent” item.
-7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
+1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
+2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
+3. **Test** your schema on partial HTML or a test page before a big crawl.
+4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
+5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it'll often show warnings.
+6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the "parent" item.
+7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
+8. **Consider Using Regex First**: For simple data types like emails, URLs, and dates, `RegexExtractionStrategy` is often the fastest approach.
---
-## 8. Schema Generation Utility
+## 9. Schema Generation Utility
While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
@@ -415,6 +650,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai import LLMConfig
# Sample HTML with product information
html = """
@@ -433,17 +669,15 @@ html = """
# Option 1: Using OpenAI (requires API token)
css_schema = JsonCssExtractionStrategy.generate_schema(
html,
- schema_type="css", # This is the default
- llm_provider="openai/gpt-4o", # Default provider
- api_token="your-openai-token" # Required for OpenAI
+ schema_type="css",
+ llm_config = LLMConfig(provider="openai/gpt-4o", api_token="your-openai-token")
)
# Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html,
schema_type="xpath",
- llm_provider="ollama/llama3.3", # Open source alternative
- api_token=None # Not needed for Ollama
+ llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
)
# Use the generated schema for fast, repeated extractions
@@ -482,27 +716,26 @@ strategy = JsonCssExtractionStrategy(css_schema)
- Use OpenAI for production-quality schemas
- Use Ollama for development, testing, or when you need a self-hosted solution
-That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
-
---
-## 9. Conclusion
+## 10. Conclusion
-With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that:
+With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that:
- Scrape any consistent site for structured data.
-- Support nested objects, repeating lists, or advanced transformations.
+- Support nested objects, repeating lists, or pattern-based extraction.
- Scale to thousands of pages quickly and reliably.
-**Next Steps**:
+**Choosing the Right Strategy**:
-- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed.
-- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded.
+- Use **`RegexExtractionStrategy`** for fast extraction of common data types like emails, phones, URLs, dates, etc.
+- Use **`JsonCssExtractionStrategy`** or **`JsonXPathExtractionStrategy`** for structured data with clear HTML patterns.
+- If you need both: first extract structured data with JSON strategies, then run regex over specific fields (see the sketch below).
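+
+Here is a minimal sketch of that combined flow (the URL, selectors, and field names are placeholders):
+
+```python
+import asyncio
+import json
+import re
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+schema = {
+    "name": "Contact cards",
+    "baseSelector": "div.card",  # hypothetical
+    "fields": [{"name": "contact", "selector": "p.contact", "type": "text"}],
+}
+
+EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
+
+async def main():
+    config = CrawlerRunConfig(extraction_strategy=JsonCssExtractionStrategy(schema))
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+
+    items = json.loads(result.extracted_content)
+    for item in items:
+        # Second pass: plain regex on the already-extracted field, no extra crawl needed
+        item["emails"] = EMAIL_RE.findall(item.get("contact", ""))
+    print(items)
+
+asyncio.run(main())
+```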
-**Remember**: For repeated, structured data, you don’t need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
+**Remember**: For repeated, structured data, you don't need to pay for or wait on an LLM. Well-crafted schemas and regex patterns get you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
-**Last Updated**: 2025-01-01
+**Last Updated**: 2025-05-02
---
-That’s it for **Extracting JSON (No LLM)**! You’ve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
\ No newline at end of file
+That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) and regex patterns can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
\ No newline at end of file
diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md
index 7a230d5d..4e54da7d 100644
--- a/docs/md_v2/index.md
+++ b/docs/md_v2/index.md
@@ -72,6 +72,14 @@ asyncio.run(main())
---
+## Video Tutorial
+
+*(Embedded video tutorial)*
+
+---
+
## What Does Crawl4AI Do?
Crawl4AI is a feature-rich crawler and scraper that aims to:
diff --git a/docs/notebooks/Crawl4AI_v0.3.72_Release_Announcement.ipynb b/docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb
similarity index 100%
rename from docs/notebooks/Crawl4AI_v0.3.72_Release_Announcement.ipynb
rename to docs/releases_review/Crawl4AI_v0.3.72_Release_Announcement.ipynb
diff --git a/docs/examples/v0.3.74.overview.py b/docs/releases_review/v0.3.74.overview.py
similarity index 100%
rename from docs/examples/v0.3.74.overview.py
rename to docs/releases_review/v0.3.74.overview.py
diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/releases_review/v0_4_24_walkthrough.py
similarity index 100%
rename from docs/examples/v0_4_24_walkthrough.py
rename to docs/releases_review/v0_4_24_walkthrough.py
diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/releases_review/v0_4_3b2_features_demo.py
similarity index 98%
rename from docs/examples/v0_4_3b2_features_demo.py
rename to docs/releases_review/v0_4_3b2_features_demo.py
index 1032f346..37862784 100644
--- a/docs/examples/v0_4_3b2_features_demo.py
+++ b/docs/releases_review/v0_4_3b2_features_demo.py
@@ -31,9 +31,6 @@ import re
import random
from typing import Optional, Dict
from dotenv import load_dotenv
-
-load_dotenv()
-
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
@@ -48,6 +45,7 @@ from crawl4ai import (
LLMContentFilter
)
+load_dotenv()
async def demo_memory_dispatcher():
"""Demonstrates the new memory-efficient dispatcher system.
@@ -283,7 +281,7 @@ async def demo_proxy_rotation():
"""
print("\n=== 8. Proxy Rotation Demo ===")
- async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
+ async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
"""Get next proxy from local file"""
try:
proxies = os.getenv("PROXIES", "").split(",")
@@ -323,7 +321,7 @@ async def demo_proxy_rotation():
if verified:
print(f"✅ Proxy working! IP matches: {proxy['ip']}")
else:
- print(f"❌ Proxy failed or IP mismatch!")
+ print("❌ Proxy failed or IP mismatch!")
else:
print(f"Failed with proxy {proxy['ip']}")
diff --git a/docs/snippets/deep_crawl/1.intro.py b/docs/snippets/deep_crawl/1.intro.py
new file mode 100644
index 00000000..d8fd2f94
--- /dev/null
+++ b/docs/snippets/deep_crawl/1.intro.py
@@ -0,0 +1,78 @@
+import asyncio
+from typing import List
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BFSDeepCrawlStrategy,
+ CrawlResult,
+ FilterChain,
+ DomainFilter,
+ URLPatternFilter,
+)
+
+# Import necessary classes from crawl4ai library:
+# - AsyncWebCrawler: The main class for web crawling.
+# - CrawlerRunConfig: Configuration class for crawler behavior.
+# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
+# - CrawlResult: Data model for individual crawl results.
+# - FilterChain: Used to chain multiple URL filters.
+# - URLPatternFilter: Filter URLs based on patterns.
+# Note: `from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter` also works;
+# for simplicity and consistency we import directly from crawl4ai, which re-exports these names in its __init__.py.
+
+async def basic_deep_crawl():
+ """
+ Performs a basic deep crawl starting from a seed URL, demonstrating:
+ - Breadth-First Search (BFS) deep crawling strategy.
+ - Filtering URLs based on URL patterns.
+ - Accessing crawl results and metadata.
+ """
+
+ # 1. Define URL Filters:
+ # Create a URLPatternFilter to include only URLs containing "text".
+ # This filter will be used to restrict crawling to URLs that are likely to contain textual content.
+ url_filter = URLPatternFilter(
+ patterns=[
+ "*text*", # Include URLs that contain "text" in their path or URL
+ ]
+ )
+
+ # Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
+ # This filter will be used to restrict crawling to URLs within the "groq.com" domain.
+ domain_filter = DomainFilter(
+ allowed_domains=["groq.com"],
+ blocked_domains=["example.com"],
+ )
+
+ # 2. Configure CrawlerRunConfig for Deep Crawling:
+ # Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL
+ max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
+ include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
+ filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
+ ),
+ verbose=True, # Enable verbose logging to see detailed output during crawling
+ )
+
+ # 3. Initialize and Run AsyncWebCrawler:
+ # Use AsyncWebCrawler as a context manager for automatic start and close.
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ # url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
+ url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
+ config=config, # Pass the configured CrawlerRunConfig to arun method
+ )
+
+ # 4. Process and Print Crawl Results:
+ # Iterate through the list of CrawlResult objects returned by the deep crawl.
+ for result in results:
+ # Print the URL and its crawl depth from the metadata for each crawled URL.
+ print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
+
+
+if __name__ == "__main__":
+    asyncio.run(basic_deep_crawl())
diff --git a/docs/snippets/deep_crawl/2.filters.py b/docs/snippets/deep_crawl/2.filters.py
new file mode 100644
index 00000000..c50eae0a
--- /dev/null
+++ b/docs/snippets/deep_crawl/2.filters.py
@@ -0,0 +1,162 @@
+import asyncio
+from typing import List
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BFSDeepCrawlStrategy,
+ CrawlResult,
+ URLFilter, # Base class for filters, not directly used in examples but good to import for context
+ ContentTypeFilter,
+ DomainFilter,
+ FilterChain,
+ URLPatternFilter,
+ SEOFilter # Advanced filter, can be introduced later or as bonus
+)
+
+async def deep_crawl_filter_tutorial_part_2():
+ """
+ Tutorial demonstrating URL filters in Crawl4AI, focusing on isolated filter behavior
+ before integrating them into a deep crawl.
+
+ This tutorial covers:
+ - Testing individual filters with synthetic URLs.
+ - Understanding filter logic and behavior in isolation.
+ - Combining filters using FilterChain.
+ - Integrating filters into a deep crawling example.
+ """
+
+ # === Introduction: URL Filters in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== Introduction: URL Filters in Isolation ===")
+ print("=" * 40 + "\n")
+ print("In this section, we will explore each filter individually using synthetic URLs.")
+ print("This allows us to understand exactly how each filter works before using them in a crawl.\n")
+
+
+ # === 2. ContentTypeFilter - Testing in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== 2. ContentTypeFilter - Testing in Isolation ===")
+ print("=" * 40 + "\n")
+
+ # 2.1. Create ContentTypeFilter:
+ # Create a ContentTypeFilter to allow only 'text/html' and 'application/json' content types
+ # BASED ON URL EXTENSIONS.
+ content_type_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
+ print("ContentTypeFilter created, allowing types (by extension): ['text/html', 'application/json']")
+ print("Note: ContentTypeFilter in Crawl4ai works by checking URL file extensions, not HTTP headers.")
+
+
+ # 2.2. Synthetic URLs for Testing:
+ # ContentTypeFilter checks URL extensions. We provide URLs with different extensions to test.
+ test_urls_content_type = [
+ "https://example.com/page.html", # Should pass: .html extension (text/html)
+ "https://example.com/data.json", # Should pass: .json extension (application/json)
+ "https://example.com/image.png", # Should reject: .png extension (not allowed type)
+ "https://example.com/document.pdf", # Should reject: .pdf extension (not allowed type)
+ "https://example.com/page", # Should pass: no extension (defaults to allow) - check default behaviour!
+ "https://example.com/page.xhtml", # Should pass: .xhtml extension (text/html)
+ ]
+
+ # 2.3. Apply Filter and Show Results:
+ print("\n=== Testing ContentTypeFilter (URL Extension based) ===")
+ for url in test_urls_content_type:
+ passed = content_type_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ extension = ContentTypeFilter._extract_extension(url) # Show extracted extension for clarity
+ print(f"- URL: {url} - {result} (Extension: '{extension or 'No Extension'}')")
+ print("=" * 40)
+
+ input("Press Enter to continue to DomainFilter example...")
+
+ # === 3. DomainFilter - Testing in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== 3. DomainFilter - Testing in Isolation ===")
+ print("=" * 40 + "\n")
+
+ # 3.1. Create DomainFilter:
+ domain_filter = DomainFilter(allowed_domains=["crawl4ai.com", "example.com"])
+ print("DomainFilter created, allowing domains: ['crawl4ai.com', 'example.com']")
+
+ # 3.2. Synthetic URLs for Testing:
+ test_urls_domain = [
+ "https://docs.crawl4ai.com/api",
+ "https://example.com/products",
+ "https://another-website.org/blog",
+ "https://sub.example.com/about",
+ "https://crawl4ai.com.attacker.net", # Corrected example: now should be rejected
+ ]
+
+ # 3.3. Apply Filter and Show Results:
+ print("\n=== Testing DomainFilter ===")
+ for url in test_urls_domain:
+ passed = domain_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ print(f"- URL: {url} - {result}")
+ print("=" * 40)
+
+ input("Press Enter to continue to FilterChain example...")
+
+ # === 4. FilterChain - Combining Filters ===
+ print("\n" + "=" * 40)
+ print("=== 4. FilterChain - Combining Filters ===")
+ print("=" * 40 + "\n")
+
+ combined_filter = FilterChain(
+ filters=[
+ URLPatternFilter(patterns=["*api*"]),
+ ContentTypeFilter(allowed_types=["text/html"]), # Still URL extension based
+ DomainFilter(allowed_domains=["docs.crawl4ai.com"]),
+ ]
+ )
+ print("FilterChain created, combining URLPatternFilter, ContentTypeFilter, and DomainFilter.")
+
+
+ test_urls_combined = [
+ "https://docs.crawl4ai.com/api/async-webcrawler",
+ "https://example.com/api/products",
+ "https://docs.crawl4ai.com/core/crawling",
+ "https://another-website.org/api/data",
+ ]
+
+ # 4.3. Apply FilterChain and Show Results
+ print("\n=== Testing FilterChain (URLPatternFilter + ContentTypeFilter + DomainFilter) ===")
+ for url in test_urls_combined:
+ passed = await combined_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ print(f"- URL: {url} - {result}")
+ print("=" * 40)
+
+ input("Press Enter to continue to Deep Crawl with FilterChain example...")
+
+ # === 5. Deep Crawl with FilterChain ===
+ print("\n" + "=" * 40)
+ print("=== 5. Deep Crawl with FilterChain ===")
+ print("=" * 40 + "\n")
+ print("Finally, let's integrate the FilterChain into a deep crawl example.")
+
+ config_final_crawl = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2,
+ max_pages=10,
+ include_external=False,
+ filter_chain=combined_filter
+ ),
+ verbose=False,
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ results_final_crawl: List[CrawlResult] = await crawler.arun(
+ url="https://docs.crawl4ai.com", config=config_final_crawl
+ )
+
+ print("=== Crawled URLs (Deep Crawl with FilterChain) ===")
+ for result in results_final_crawl:
+ print(f"- {result.url}, Depth: {result.metadata.get('depth', 0)}")
+ print("=" * 40)
+
+ print("\nTutorial Completed! Review the output of each section to understand URL filters.")
+
+
+if __name__ == "__main__":
+ asyncio.run(deep_crawl_filter_tutorial_part_2())
\ No newline at end of file
diff --git a/docs/tutorials/coming_soon.md b/docs/tutorials/coming_soon.md
new file mode 100644
index 00000000..e69de29b
diff --git a/main.py b/main.py
deleted file mode 100644
index 029653cd..00000000
--- a/main.py
+++ /dev/null
@@ -1,526 +0,0 @@
-import asyncio, os
-from fastapi import FastAPI, HTTPException
-from fastapi import FastAPI, HTTPException
-from fastapi.staticfiles import StaticFiles
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.templating import Jinja2Templates
-from fastapi.responses import RedirectResponse
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-from fastapi import Depends, Security
-
-from pydantic import BaseModel, HttpUrl, Field
-from typing import Optional, List, Dict, Any, Union
-import psutil
-import time
-import uuid
-import math
-import logging
-from enum import Enum
-from dataclasses import dataclass
-from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
-from crawl4ai.config import MIN_WORD_THRESHOLD
-from crawl4ai.extraction_strategy import (
- LLMExtractionStrategy,
- CosineStrategy,
- JsonCssExtractionStrategy,
-)
-
-__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-
-class TaskStatus(str, Enum):
- PENDING = "pending"
- PROCESSING = "processing"
- COMPLETED = "completed"
- FAILED = "failed"
-
-
-class CrawlerType(str, Enum):
- BASIC = "basic"
- LLM = "llm"
- COSINE = "cosine"
- JSON_CSS = "json_css"
-
-
-class ExtractionConfig(BaseModel):
- type: CrawlerType
- params: Dict[str, Any] = {}
-
-
-class ChunkingStrategy(BaseModel):
- type: str
- params: Dict[str, Any] = {}
-
-
-class ContentFilter(BaseModel):
- type: str = "bm25"
- params: Dict[str, Any] = {}
-
-
-class CrawlRequest(BaseModel):
- urls: Union[HttpUrl, List[HttpUrl]]
- word_count_threshold: int = MIN_WORD_THRESHOLD
- extraction_config: Optional[ExtractionConfig] = None
- chunking_strategy: Optional[ChunkingStrategy] = None
- content_filter: Optional[ContentFilter] = None
- js_code: Optional[List[str]] = None
- wait_for: Optional[str] = None
- css_selector: Optional[str] = None
- screenshot: bool = False
- magic: bool = False
- extra: Optional[Dict[str, Any]] = {}
- session_id: Optional[str] = None
- cache_mode: Optional[CacheMode] = CacheMode.ENABLED
- priority: int = Field(default=5, ge=1, le=10)
- ttl: Optional[int] = 3600
- crawler_params: Dict[str, Any] = {}
-
-
-@dataclass
-class TaskInfo:
- id: str
- status: TaskStatus
- result: Optional[Union[CrawlResult, List[CrawlResult]]] = None
- error: Optional[str] = None
- created_at: float = time.time()
- ttl: int = 3600
-
-
-class ResourceMonitor:
- def __init__(self, max_concurrent_tasks: int = 10):
- self.max_concurrent_tasks = max_concurrent_tasks
- self.memory_threshold = 0.85
- self.cpu_threshold = 0.90
- self._last_check = 0
- self._check_interval = 1 # seconds
- self._last_available_slots = max_concurrent_tasks
-
- async def get_available_slots(self) -> int:
- current_time = time.time()
- if current_time - self._last_check < self._check_interval:
- return self._last_available_slots
-
- mem_usage = psutil.virtual_memory().percent / 100
- cpu_usage = psutil.cpu_percent() / 100
-
- memory_factor = max(
- 0, (self.memory_threshold - mem_usage) / self.memory_threshold
- )
- cpu_factor = max(0, (self.cpu_threshold - cpu_usage) / self.cpu_threshold)
-
- self._last_available_slots = math.floor(
- self.max_concurrent_tasks * min(memory_factor, cpu_factor)
- )
- self._last_check = current_time
-
- return self._last_available_slots
-
-
-class TaskManager:
- def __init__(self, cleanup_interval: int = 300):
- self.tasks: Dict[str, TaskInfo] = {}
- self.high_priority = asyncio.PriorityQueue()
- self.low_priority = asyncio.PriorityQueue()
- self.cleanup_interval = cleanup_interval
- self.cleanup_task = None
-
- async def start(self):
- self.cleanup_task = asyncio.create_task(self._cleanup_loop())
-
- async def stop(self):
- if self.cleanup_task:
- self.cleanup_task.cancel()
- try:
- await self.cleanup_task
- except asyncio.CancelledError:
- pass
-
- async def add_task(self, task_id: str, priority: int, ttl: int) -> None:
- task_info = TaskInfo(id=task_id, status=TaskStatus.PENDING, ttl=ttl)
- self.tasks[task_id] = task_info
- queue = self.high_priority if priority > 5 else self.low_priority
- await queue.put((-priority, task_id)) # Negative for proper priority ordering
-
- async def get_next_task(self) -> Optional[str]:
- try:
- # Try high priority first
- _, task_id = await asyncio.wait_for(self.high_priority.get(), timeout=0.1)
- return task_id
- except asyncio.TimeoutError:
- try:
- # Then try low priority
- _, task_id = await asyncio.wait_for(
- self.low_priority.get(), timeout=0.1
- )
- return task_id
- except asyncio.TimeoutError:
- return None
-
- def update_task(
- self, task_id: str, status: TaskStatus, result: Any = None, error: str = None
- ):
- if task_id in self.tasks:
- task_info = self.tasks[task_id]
- task_info.status = status
- task_info.result = result
- task_info.error = error
-
- def get_task(self, task_id: str) -> Optional[TaskInfo]:
- return self.tasks.get(task_id)
-
- async def _cleanup_loop(self):
- while True:
- try:
- await asyncio.sleep(self.cleanup_interval)
- current_time = time.time()
- expired_tasks = [
- task_id
- for task_id, task in self.tasks.items()
- if current_time - task.created_at > task.ttl
- and task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
- ]
- for task_id in expired_tasks:
- del self.tasks[task_id]
- except Exception as e:
- logger.error(f"Error in cleanup loop: {e}")
-
-
-class CrawlerPool:
- def __init__(self, max_size: int = 10):
- self.max_size = max_size
- self.active_crawlers: Dict[AsyncWebCrawler, float] = {}
- self._lock = asyncio.Lock()
-
- async def acquire(self, **kwargs) -> AsyncWebCrawler:
- async with self._lock:
- # Clean up inactive crawlers
- current_time = time.time()
- inactive = [
- crawler
- for crawler, last_used in self.active_crawlers.items()
- if current_time - last_used > 600 # 10 minutes timeout
- ]
- for crawler in inactive:
- await crawler.__aexit__(None, None, None)
- del self.active_crawlers[crawler]
-
- # Create new crawler if needed
- if len(self.active_crawlers) < self.max_size:
- crawler = AsyncWebCrawler(**kwargs)
- await crawler.__aenter__()
- self.active_crawlers[crawler] = current_time
- return crawler
-
- # Reuse least recently used crawler
- crawler = min(self.active_crawlers.items(), key=lambda x: x[1])[0]
- self.active_crawlers[crawler] = current_time
- return crawler
-
- async def release(self, crawler: AsyncWebCrawler):
- async with self._lock:
- if crawler in self.active_crawlers:
- self.active_crawlers[crawler] = time.time()
-
- async def cleanup(self):
- async with self._lock:
- for crawler in list(self.active_crawlers.keys()):
- await crawler.__aexit__(None, None, None)
- self.active_crawlers.clear()
-
-
-class CrawlerService:
- def __init__(self, max_concurrent_tasks: int = 10):
- self.resource_monitor = ResourceMonitor(max_concurrent_tasks)
- self.task_manager = TaskManager()
- self.crawler_pool = CrawlerPool(max_concurrent_tasks)
- self._processing_task = None
-
- async def start(self):
- await self.task_manager.start()
- self._processing_task = asyncio.create_task(self._process_queue())
-
- async def stop(self):
- if self._processing_task:
- self._processing_task.cancel()
- try:
- await self._processing_task
- except asyncio.CancelledError:
- pass
- await self.task_manager.stop()
- await self.crawler_pool.cleanup()
-
- def _create_extraction_strategy(self, config: ExtractionConfig):
- if not config:
- return None
-
- if config.type == CrawlerType.LLM:
- return LLMExtractionStrategy(**config.params)
- elif config.type == CrawlerType.COSINE:
- return CosineStrategy(**config.params)
- elif config.type == CrawlerType.JSON_CSS:
- return JsonCssExtractionStrategy(**config.params)
- return None
-
- async def submit_task(self, request: CrawlRequest) -> str:
- task_id = str(uuid.uuid4())
- await self.task_manager.add_task(task_id, request.priority, request.ttl or 3600)
-
- # Store request data with task
- self.task_manager.tasks[task_id].request = request
-
- return task_id
-
- async def _process_queue(self):
- while True:
- try:
- available_slots = await self.resource_monitor.get_available_slots()
- if False and available_slots <= 0:
- await asyncio.sleep(1)
- continue
-
- task_id = await self.task_manager.get_next_task()
- if not task_id:
- await asyncio.sleep(1)
- continue
-
- task_info = self.task_manager.get_task(task_id)
- if not task_info:
- continue
-
- request = task_info.request
- self.task_manager.update_task(task_id, TaskStatus.PROCESSING)
-
- try:
- crawler = await self.crawler_pool.acquire(**request.crawler_params)
-
- extraction_strategy = self._create_extraction_strategy(
- request.extraction_config
- )
-
- if isinstance(request.urls, list):
- results = await crawler.arun_many(
- urls=[str(url) for url in request.urls],
- word_count_threshold=MIN_WORD_THRESHOLD,
- extraction_strategy=extraction_strategy,
- js_code=request.js_code,
- wait_for=request.wait_for,
- css_selector=request.css_selector,
- screenshot=request.screenshot,
- magic=request.magic,
- session_id=request.session_id,
- cache_mode=request.cache_mode,
- **request.extra,
- )
- else:
- results = await crawler.arun(
- url=str(request.urls),
- extraction_strategy=extraction_strategy,
- js_code=request.js_code,
- wait_for=request.wait_for,
- css_selector=request.css_selector,
- screenshot=request.screenshot,
- magic=request.magic,
- session_id=request.session_id,
- cache_mode=request.cache_mode,
- **request.extra,
- )
-
- await self.crawler_pool.release(crawler)
- self.task_manager.update_task(
- task_id, TaskStatus.COMPLETED, results
- )
-
- except Exception as e:
- logger.error(f"Error processing task {task_id}: {str(e)}")
- self.task_manager.update_task(
- task_id, TaskStatus.FAILED, error=str(e)
- )
-
- except Exception as e:
- logger.error(f"Error in queue processing: {str(e)}")
- await asyncio.sleep(1)
-
-
-app = FastAPI(title="Crawl4AI API")
-
-# CORS configuration
-origins = ["*"] # Allow all origins
-app.add_middleware(
- CORSMiddleware,
- allow_origins=origins, # List of origins that are allowed to make requests
- allow_credentials=True,
- allow_methods=["*"], # Allows all methods
- allow_headers=["*"], # Allows all headers
-)
-
-# API token security
-security = HTTPBearer()
-CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
-
-
-async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
- if not CRAWL4AI_API_TOKEN:
- return credentials # No token verification if CRAWL4AI_API_TOKEN is not set
- if credentials.credentials != CRAWL4AI_API_TOKEN:
- raise HTTPException(status_code=401, detail="Invalid token")
- return credentials
-
-
-def secure_endpoint():
- """Returns security dependency only if CRAWL4AI_API_TOKEN is set"""
- return Depends(verify_token) if CRAWL4AI_API_TOKEN else None
-
-
-# Check if site directory exists
-if os.path.exists(__location__ + "/site"):
- # Mount the site directory as a static directory
- app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs")
-
-site_templates = Jinja2Templates(directory=__location__ + "/site")
-
-crawler_service = CrawlerService()
-
-
-@app.on_event("startup")
-async def startup_event():
- await crawler_service.start()
-
-
-@app.on_event("shutdown")
-async def shutdown_event():
- await crawler_service.stop()
-
-
-@app.get("/")
-def read_root():
- if os.path.exists(__location__ + "/site"):
- return RedirectResponse(url="/mkdocs")
- # Return a json response
- return {"message": "Crawl4AI API service is running"}
-
-
-@app.post("/crawl", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
-async def crawl(request: CrawlRequest) -> Dict[str, str]:
- task_id = await crawler_service.submit_task(request)
- return {"task_id": task_id}
-
-
-@app.get(
- "/task/{task_id}", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
-)
-async def get_task_status(task_id: str):
- task_info = crawler_service.task_manager.get_task(task_id)
- if not task_info:
- raise HTTPException(status_code=404, detail="Task not found")
-
- response = {
- "status": task_info.status,
- "created_at": task_info.created_at,
- }
-
- if task_info.status == TaskStatus.COMPLETED:
- # Convert CrawlResult to dict for JSON response
- if isinstance(task_info.result, list):
- response["results"] = [result.dict() for result in task_info.result]
- else:
- response["result"] = task_info.result.dict()
- elif task_info.status == TaskStatus.FAILED:
- response["error"] = task_info.error
-
- return response
-
-
-@app.post("/crawl_sync", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
-async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
- task_id = await crawler_service.submit_task(request)
-
- # Wait up to 60 seconds for task completion
- for _ in range(60):
- task_info = crawler_service.task_manager.get_task(task_id)
- if not task_info:
- raise HTTPException(status_code=404, detail="Task not found")
-
- if task_info.status == TaskStatus.COMPLETED:
- # Return same format as /task/{task_id} endpoint
- if isinstance(task_info.result, list):
- return {
- "status": task_info.status,
- "results": [result.dict() for result in task_info.result],
- }
- return {"status": task_info.status, "result": task_info.result.dict()}
-
- if task_info.status == TaskStatus.FAILED:
- raise HTTPException(status_code=500, detail=task_info.error)
-
- await asyncio.sleep(1)
-
- # If we get here, task didn't complete within timeout
- raise HTTPException(status_code=408, detail="Task timed out")
-
-
-@app.post(
- "/crawl_direct", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
-)
-async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
- try:
- crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
- extraction_strategy = crawler_service._create_extraction_strategy(
- request.extraction_config
- )
-
- try:
- if isinstance(request.urls, list):
- results = await crawler.arun_many(
- urls=[str(url) for url in request.urls],
- extraction_strategy=extraction_strategy,
- js_code=request.js_code,
- wait_for=request.wait_for,
- css_selector=request.css_selector,
- screenshot=request.screenshot,
- magic=request.magic,
- cache_mode=request.cache_mode,
- session_id=request.session_id,
- **request.extra,
- )
- return {"results": [result.dict() for result in results]}
- else:
- result = await crawler.arun(
- url=str(request.urls),
- extraction_strategy=extraction_strategy,
- js_code=request.js_code,
- wait_for=request.wait_for,
- css_selector=request.css_selector,
- screenshot=request.screenshot,
- magic=request.magic,
- cache_mode=request.cache_mode,
- session_id=request.session_id,
- **request.extra,
- )
- return {"result": result.dict()}
- finally:
- await crawler_service.crawler_pool.release(crawler)
- except Exception as e:
- logger.error(f"Error in direct crawl: {str(e)}")
- raise HTTPException(status_code=500, detail=str(e))
-
-
-@app.get("/health")
-async def health_check():
- available_slots = await crawler_service.resource_monitor.get_available_slots()
- memory = psutil.virtual_memory()
- return {
- "status": "healthy",
- "available_slots": available_slots,
- "memory_usage": memory.percent,
- "cpu_usage": psutil.cpu_percent(),
- }
-
-
-if __name__ == "__main__":
- import uvicorn
-
- uvicorn.run(app, host="0.0.0.0", port=11235)
diff --git a/mkdocs.yml b/mkdocs.yml
index 16f44b05..23f4ceda 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: Crawl4AI Documentation (v0.4.3b2)
+site_name: Crawl4AI Documentation (v0.6.x)
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai
@@ -7,17 +7,21 @@ docs_dir: docs/md_v2
nav:
- Home: 'index.md'
+ - "Ask AI": "core/ask-ai.md"
+ - "Quick Start": "core/quickstart.md"
+ - "Code Examples": "core/examples.md"
- Setup & Installation:
- "Installation": "core/installation.md"
- - "Docker Deployment": "core/docker-deploymeny.md"
- - "Quick Start": "core/quickstart.md"
+ - "Docker Deployment": "core/docker-deployment.md"
- "Blog & Changelog":
- "Blog Home": "blog/index.md"
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
- Core:
+ - "Command Line Interface": "core/cli.md"
- "Simple Crawling": "core/simple-crawling.md"
+ - "Deep Crawling": "core/deep-crawling.md"
- "Crawler Result": "core/crawler-result.md"
- - "Browser & Crawler Config": "core/browser-crawler-config.md"
+ - "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
- "Markdown Generation": "core/markdown-generation.md"
- "Fit Markdown": "core/fit-markdown.md"
- "Page Interaction": "core/page-interaction.md"
@@ -36,6 +40,7 @@ nav:
- "Crawl Dispatcher": "advanced/crawl-dispatcher.md"
- "Identity Based Crawling": "advanced/identity-based-crawling.md"
- "SSL Certificate": "advanced/ssl-certificate.md"
+ - "Network & Console Capture": "advanced/network-console-capture.md"
- Extraction:
- "LLM-Free Strategies": "extraction/no-llm-strategies.md"
- "LLM Strategies": "extraction/llm-strategies.md"
@@ -45,7 +50,7 @@ nav:
- "AsyncWebCrawler": "api/async-webcrawler.md"
- "arun()": "api/arun.md"
- "arun_many()": "api/arun_many.md"
- - "Browser & Crawler Config": "api/parameters.md"
+ - "Browser, Crawler & LLM Config": "api/parameters.md"
- "CrawlResult": "api/crawl-result.md"
- "Strategies": "api/strategies.md"
@@ -73,6 +78,7 @@ extra:
version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css:
+ - assets/layout.css
- assets/styles.css
- assets/highlight.css
- assets/dmvendor.css
@@ -80,4 +86,10 @@ extra_css:
extra_javascript:
- assets/highlight.min.js
- assets/highlight_init.js
- - https://buttons.github.io/buttons.js
\ No newline at end of file
+ - https://buttons.github.io/buttons.js
+ - assets/toc.js
+ - assets/github_stats.js
+ - assets/selection_ask_ai.js
+ - assets/copy_code.js
+ - assets/floating_ask_ai_button.js
+ - assets/mobile_menu.js
\ No newline at end of file
diff --git a/prompts/prompt_net_requests.md b/prompts/prompt_net_requests.md
new file mode 100644
index 00000000..d033591e
--- /dev/null
+++ b/prompts/prompt_net_requests.md
@@ -0,0 +1,489 @@
+I want to enhance the `AsyncPlaywrightCrawlerStrategy` to optionally capture network requests and console messages during a crawl, storing them in the final `CrawlResult`.
+
+Here's a breakdown of the proposed changes across the relevant files:
+
+**1. Configuration (`crawl4ai/async_configs.py`)**
+
+* **Goal:** Add flags to `CrawlerRunConfig` to enable/disable capturing.
+* **Changes:**
+ * Add two new boolean attributes to `CrawlerRunConfig`:
+ * `capture_network_requests: bool = False`
+ * `capture_console_messages: bool = False`
+ * Update `__init__`, `from_kwargs`, `to_dict`, and implicitly `clone`/`dump`/`load` to include these new attributes.
+
+```python
+# ==== File: crawl4ai/async_configs.py ====
+# ... (imports) ...
+
+class CrawlerRunConfig():
+ # ... (existing attributes) ...
+
+ # NEW: Network and Console Capturing Parameters
+ capture_network_requests: bool = False
+ capture_console_messages: bool = False
+
+ # Experimental Parameters
+    experimental: Dict[str, Any] = None
+
+ def __init__(
+ self,
+ # ... (existing parameters) ...
+
+ # NEW: Network and Console Capturing Parameters
+ capture_network_requests: bool = False,
+ capture_console_messages: bool = False,
+
+ # Experimental Parameters
+ experimental: Dict[str, Any] = None,
+ ):
+ # ... (existing assignments) ...
+
+ # NEW: Assign new parameters
+ self.capture_network_requests = capture_network_requests
+ self.capture_console_messages = capture_console_messages
+
+ # Experimental Parameters
+ self.experimental = experimental or {}
+
+ # ... (rest of __init__) ...
+
+ @staticmethod
+ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
+ return CrawlerRunConfig(
+ # ... (existing kwargs gets) ...
+
+ # NEW: Get new parameters
+ capture_network_requests=kwargs.get("capture_network_requests", False),
+ capture_console_messages=kwargs.get("capture_console_messages", False),
+
+ # Experimental Parameters
+ experimental=kwargs.get("experimental"),
+ )
+
+ def to_dict(self):
+ return {
+ # ... (existing dict entries) ...
+
+ # NEW: Add new parameters to dict
+ "capture_network_requests": self.capture_network_requests,
+ "capture_console_messages": self.capture_console_messages,
+
+ "experimental": self.experimental,
+ }
+
+ # clone(), dump(), load() should work automatically if they rely on to_dict() and from_kwargs()
+ # or the serialization logic correctly handles all attributes.
+```
+
+**2. Data Models (`crawl4ai/models.py`)**
+
+* **Goal:** Add fields to store the captured data in the response/result objects.
+* **Changes:**
+ * Add `network_requests: Optional[List[Dict[str, Any]]] = None` and `console_messages: Optional[List[Dict[str, Any]]] = None` to `AsyncCrawlResponse`.
+ * Add the same fields to `CrawlResult`.
+
+```python
+# ==== File: crawl4ai/models.py ====
+# ... (imports) ...
+
+# ... (Existing dataclasses/models) ...
+
+class AsyncCrawlResponse(BaseModel):
+ html: str
+ response_headers: Dict[str, str]
+ js_execution_result: Optional[Dict[str, Any]] = None
+ status_code: int
+ screenshot: Optional[str] = None
+ pdf_data: Optional[bytes] = None
+ get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+ downloaded_files: Optional[List[str]] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+ redirected_url: Optional[str] = None
+ # NEW: Fields for captured data
+ network_requests: Optional[List[Dict[str, Any]]] = None
+ console_messages: Optional[List[Dict[str, Any]]] = None
+
+ class Config:
+ arbitrary_types_allowed = True
+
+# ... (Existing models like MediaItem, Link, etc.) ...
+
+class CrawlResult(BaseModel):
+ url: str
+ html: str
+ success: bool
+ cleaned_html: Optional[str] = None
+ media: Dict[str, List[Dict]] = {}
+ links: Dict[str, List[Dict]] = {}
+ downloaded_files: Optional[List[str]] = None
+ js_execution_result: Optional[Dict[str, Any]] = None
+ screenshot: Optional[str] = None
+ pdf: Optional[bytes] = None
+ mhtml: Optional[str] = None # Added mhtml based on the provided models.py
+ _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
+ extracted_content: Optional[str] = None
+ metadata: Optional[dict] = None
+ error_message: Optional[str] = None
+ session_id: Optional[str] = None
+ response_headers: Optional[dict] = None
+ status_code: Optional[int] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+ dispatch_result: Optional[DispatchResult] = None
+ redirected_url: Optional[str] = None
+ # NEW: Fields for captured data
+ network_requests: Optional[List[Dict[str, Any]]] = None
+ console_messages: Optional[List[Dict[str, Any]]] = None
+
+ class Config:
+ arbitrary_types_allowed = True
+
+ # ... (Existing __init__, properties, model_dump for markdown compatibility) ...
+
+# ... (Rest of the models) ...
+```
+
+**3. Crawler Strategy (`crawl4ai/async_crawler_strategy.py`)**
+
+* **Goal:** Implement the actual capturing logic within `AsyncPlaywrightCrawlerStrategy._crawl_web`.
+* **Changes:**
+ * Inside `_crawl_web`, initialize empty lists `captured_requests = []` and `captured_console = []`.
+ * Conditionally attach Playwright event listeners (`page.on(...)`) based on the `config.capture_network_requests` and `config.capture_console_messages` flags.
+ * Define handler functions for these listeners to extract relevant data and append it to the respective lists. Include timestamps.
+ * Pass the captured lists to the `AsyncCrawlResponse` constructor at the end of the method.
+
+```python
+# ==== File: crawl4ai/async_crawler_strategy.py ====
+# ... (imports) ...
+import time # Make sure time is imported
+
+class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
+ # ... (existing methods like __init__, start, close, etc.) ...
+
+ async def _crawl_web(
+ self, url: str, config: CrawlerRunConfig
+ ) -> AsyncCrawlResponse:
+ """
+ Internal method to crawl web URLs with the specified configuration.
+ Includes optional network and console capturing. # MODIFIED DOCSTRING
+ """
+ config.url = url
+ response_headers = {}
+ execution_result = None
+ status_code = None
+ redirected_url = url
+
+ # Reset downloaded files list for new crawl
+ self._downloaded_files = []
+
+ # Initialize capture lists - IMPORTANT: Reset per crawl
+ captured_requests: List[Dict[str, Any]] = []
+ captured_console: List[Dict[str, Any]] = []
+
+ # Handle user agent ... (existing code) ...
+
+ # Get page for session
+ page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
+
+ # ... (existing code for cookies, navigator overrides, hooks) ...
+
+ # --- Setup Capturing Listeners ---
+ # NOTE: These listeners are attached *before* page.goto()
+
+ # Network Request Capturing
+ if config.capture_network_requests:
+ async def handle_request_capture(request):
+ try:
+ post_data_str = None
+ try:
+ # Be cautious with large post data
+ post_data = request.post_data_buffer
+ if post_data:
+ # Attempt to decode, fallback to base64 or size indication
+ try:
+ post_data_str = post_data.decode('utf-8', errors='replace')
+ except UnicodeDecodeError:
+ post_data_str = f"[Binary data: {len(post_data)} bytes]"
+ except Exception:
+ post_data_str = "[Error retrieving post data]"
+
+ captured_requests.append({
+ "event_type": "request",
+ "url": request.url,
+ "method": request.method,
+ "headers": dict(request.headers), # Convert Header dict
+ "post_data": post_data_str,
+ "resource_type": request.resource_type,
+ "is_navigation_request": request.is_navigation_request(),
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing request details for {request.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "request_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()})
+
+ async def handle_response_capture(response):
+ try:
+ # Avoid capturing full response body by default due to size/security
+ # security_details = await response.security_details() # Optional: More SSL info
+ captured_requests.append({
+ "event_type": "response",
+ "url": response.url,
+ "status": response.status,
+ "status_text": response.status_text,
+ "headers": dict(response.headers), # Convert Header dict
+ "from_service_worker": response.from_service_worker,
+ # "security_details": security_details, # Uncomment if needed
+ "request_timing": response.request.timing, # Detailed timing info
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing response details for {response.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "response_capture_error", "url": response.url, "error": str(e), "timestamp": time.time()})
+
+ async def handle_request_failed_capture(request):
+ try:
+ captured_requests.append({
+ "event_type": "request_failed",
+ "url": request.url,
+ "method": request.method,
+ "resource_type": request.resource_type,
+ "failure_text": request.failure.error_text if request.failure else "Unknown failure",
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing request failed details for {request.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "request_failed_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()})
+
+ page.on("request", handle_request_capture)
+ page.on("response", handle_response_capture)
+ page.on("requestfailed", handle_request_failed_capture)
+
+ # Console Message Capturing
+ if config.capture_console_messages:
+            async def handle_console_capture(msg):
+                try:
+                    # In Playwright's Python API, location/type/text/args are properties, not methods
+                    location = msg.location
+                    # Attempt to resolve JSHandle args to primitive values
+                    resolved_args = []
+                    try:
+                        for arg in msg.args:
+                            resolved_args.append(await arg.json_value())  # json_value() is async; may fail for complex objects
+                    except Exception:
+                        resolved_args.append("[Could not resolve JSHandle args]")
+
+                    captured_console.append({
+                        "type": msg.type,  # e.g., 'log', 'error', 'warning'
+                        "text": msg.text,
+                        "args": resolved_args,  # Captured arguments
+                        "location": f"{location['url']}:{location['lineNumber']}:{location['columnNumber']}" if location else "N/A",
+                        "timestamp": time.time()
+                    })
+ except Exception as e:
+ self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE")
+ captured_console.append({"type": "console_capture_error", "error": str(e), "timestamp": time.time()})
+
+ def handle_pageerror_capture(err):
+ try:
+ captured_console.append({
+ "type": "error", # Consistent type for page errors
+ "text": err.message,
+ "stack": err.stack,
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE")
+ captured_console.append({"type": "pageerror_capture_error", "error": str(e), "timestamp": time.time()})
+
+ page.on("console", handle_console_capture)
+ page.on("pageerror", handle_pageerror_capture)
+ # --- End Setup Capturing Listeners ---
+
+
+ # Set up console logging if requested (Keep original logging logic separate or merge carefully)
+ if config.log_console:
+ # ... (original log_console setup using page.on(...) remains here) ...
+ # This allows logging to screen *and* capturing to the list if both flags are True
+ def log_consol(msg, console_log_type="debug"):
+ # ... existing implementation ...
+ pass # Placeholder for existing code
+
+ page.on("console", lambda msg: log_consol(msg, "debug"))
+ page.on("pageerror", lambda e: log_consol(e, "error"))
+
+
+ try:
+ # ... (existing code for SSL, downloads, goto, waits, JS execution, etc.) ...
+
+ # Get final HTML content
+ # ... (existing code for selector logic or page.content()) ...
+ if config.css_selector:
+ # ... existing selector logic ...
+ html = f"
\n" + "\n".join(html_parts) + "\n
"
+ else:
+ html = await page.content()
+
+ await self.execute_hook(
+ "before_return_html", page=page, html=html, context=context, config=config
+ )
+
+ # Handle PDF and screenshot generation
+ # ... (existing code) ...
+
+ # Define delayed content getter
+ # ... (existing code) ...
+
+ # Return complete response - ADD CAPTURED DATA HERE
+ return AsyncCrawlResponse(
+ html=html,
+ response_headers=response_headers,
+ js_execution_result=execution_result,
+ status_code=status_code,
+ screenshot=screenshot_data,
+ pdf_data=pdf_data,
+ get_delayed_content=get_delayed_content,
+ ssl_certificate=ssl_cert,
+ downloaded_files=(
+ self._downloaded_files if self._downloaded_files else None
+ ),
+ redirected_url=redirected_url,
+ # NEW: Pass captured data conditionally
+ network_requests=captured_requests if config.capture_network_requests else None,
+ console_messages=captured_console if config.capture_console_messages else None,
+ )
+
+ except Exception as e:
+ raise e # Re-raise the original exception
+
+ finally:
+ # If no session_id is given we should close the page
+ if not config.session_id:
+ # Detach listeners before closing to prevent potential errors during close
+ if config.capture_network_requests:
+ page.remove_listener("request", handle_request_capture)
+ page.remove_listener("response", handle_response_capture)
+ page.remove_listener("requestfailed", handle_request_failed_capture)
+ if config.capture_console_messages:
+ page.remove_listener("console", handle_console_capture)
+ page.remove_listener("pageerror", handle_pageerror_capture)
+ # Also remove logging listeners if they were attached
+ if config.log_console:
+ # Need to figure out how to remove the lambdas if necessary,
+ # or ensure they don't cause issues on close. Often, it's fine.
+ pass
+
+ await page.close()
+
+ # ... (rest of AsyncPlaywrightCrawlerStrategy methods) ...
+
+```
+
+**4. Core Crawler (`crawl4ai/async_webcrawler.py`)**
+
+* **Goal:** Ensure the captured data from `AsyncCrawlResponse` is transferred to the final `CrawlResult`.
+* **Changes:**
+ * In `arun`, when processing a non-cached result (inside the `if not cached_result or not html:` block), after receiving `async_response` and calling `aprocess_html` to get `crawl_result`, copy the `network_requests` and `console_messages` from `async_response` to `crawl_result`.
+
+```python
+# ==== File: crawl4ai/async_webcrawler.py ====
+# ... (imports) ...
+
+class AsyncWebCrawler:
+ # ... (existing methods) ...
+
+ async def arun(
+ self,
+ url: str,
+ config: CrawlerRunConfig = None,
+ **kwargs,
+ ) -> RunManyReturn:
+ # ... (existing setup, cache check) ...
+
+ async with self._lock or self.nullcontext():
+ try:
+ # ... (existing logging, cache context setup) ...
+
+ if cached_result:
+ # ... (existing cache handling logic) ...
+ # Note: Captured network/console usually not useful from cache
+ # Ensure they are None or empty if read from cache, unless stored explicitly
+ cached_result.network_requests = cached_result.network_requests or None
+ cached_result.console_messages = cached_result.console_messages or None
+ # ... (rest of cache logic) ...
+
+ # Fetch fresh content if needed
+ if not cached_result or not html:
+ t1 = time.perf_counter()
+
+ # ... (existing user agent update, robots.txt check) ...
+
+ ##############################
+ # Call CrawlerStrategy.crawl #
+ ##############################
+ async_response = await self.crawler_strategy.crawl(
+ url,
+ config=config,
+ )
+
+ # ... (existing assignment of html, screenshot, pdf, js_result from async_response) ...
+
+ t2 = time.perf_counter()
+ # ... (existing logging) ...
+
+ ###############################################################
+ # Process the HTML content, Call CrawlerStrategy.process_html #
+ ###############################################################
+ crawl_result: CrawlResult = await self.aprocess_html(
+ # ... (existing args) ...
+ )
+
+ # --- Transfer data from AsyncCrawlResponse to CrawlResult ---
+ crawl_result.status_code = async_response.status_code
+ crawl_result.redirected_url = async_response.redirected_url or url
+ crawl_result.response_headers = async_response.response_headers
+ crawl_result.downloaded_files = async_response.downloaded_files
+ crawl_result.js_execution_result = js_execution_result
+ crawl_result.ssl_certificate = async_response.ssl_certificate
+ # NEW: Copy captured data
+ crawl_result.network_requests = async_response.network_requests
+ crawl_result.console_messages = async_response.console_messages
+ # ------------------------------------------------------------
+
+ crawl_result.success = bool(html)
+ crawl_result.session_id = getattr(config, "session_id", None)
+
+ # ... (existing logging) ...
+
+ # Update cache if appropriate
+ if cache_context.should_write() and not bool(cached_result):
+ # crawl_result now includes network/console data if captured
+ await async_db_manager.acache_url(crawl_result)
+
+ return CrawlResultContainer(crawl_result)
+
+ else: # Cached result was used
+ # ... (existing logging for cache hit) ...
+ cached_result.success = bool(html)
+ cached_result.session_id = getattr(config, "session_id", None)
+ cached_result.redirected_url = cached_result.redirected_url or url
+ return CrawlResultContainer(cached_result)
+
+ except Exception as e:
+ # ... (existing error handling) ...
+ return CrawlResultContainer(
+ CrawlResult(
+ url=url, html="", success=False, error_message=error_message
+ )
+ )
+
+ # ... (aprocess_html remains unchanged regarding capture) ...
+
+ # ... (arun_many remains unchanged regarding capture) ...
+```
+
+**Summary of Changes:**
+
+1. **Configuration:** Added `capture_network_requests` and `capture_console_messages` flags to `CrawlerRunConfig`.
+2. **Models:** Added corresponding `network_requests` and `console_messages` fields (List of Dicts) to `AsyncCrawlResponse` and `CrawlResult`.
+3. **Strategy:** Implemented conditional event listeners in `AsyncPlaywrightCrawlerStrategy._crawl_web` to capture data into lists when flags are true. Populated these fields in the returned `AsyncCrawlResponse`. Added basic error handling within capture handlers. Added timestamps.
+4. **Crawler:** Modified `AsyncWebCrawler.arun` to copy the captured data from `AsyncCrawlResponse` into the final `CrawlResult` for non-cached fetches.
+
+This approach keeps the capturing logic contained within the Playwright strategy, uses clear configuration flags, and integrates the results into the existing data flow. The data format (list of dictionaries) is flexible for storing varied information from requests/responses/console messages.
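+
+**Example usage (illustrative):**
+
+Assuming the configuration flags and result fields land as proposed above, a caller would enable capturing per run and read the captured data off the returned result. This is a minimal sketch of the intended developer experience, not a final API definition.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Proposed flags from this change; both default to False
+    run_config = CrawlerRunConfig(
+        capture_network_requests=True,
+        capture_console_messages=True,
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=run_config)
+        # Proposed CrawlResult fields; each is None when its flag is disabled
+        print(f"Captured {len(result.network_requests or [])} network events")
+        for msg in (result.console_messages or [])[:5]:
+            print(msg.get("type"), "-", msg.get("text"))
+
+asyncio.run(main())
+```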
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 38e1f89f..be44397e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.9"
-license = {text = "MIT"}
+license = "Apache-2.0"
authors = [
{name = "Unclecode", email = "unclecode@kidocode.com"}
]
@@ -36,13 +36,18 @@ dependencies = [
"aiofiles",
"rich>=13.9.4",
"cssselect>=1.2.0",
- "httpx==0.27.2",
- "fake-useragent>=2.0.3"
+ "httpx>=0.27.2",
+ "fake-useragent>=2.0.3",
+ "click>=8.1.7",
+ "pyperclip>=1.8.2",
+ "chardet>=5.2.0",
+ "aiohttp>=3.11.11",
+ "brotli>=1.1.0",
+ "humanize>=4.10.0",
]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
- "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
@@ -52,17 +57,20 @@ classifiers = [
]
[project.optional-dependencies]
+pdf = ["PyPDF2"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
+ "PyPDF2",
"torch",
"nltk",
"scikit-learn",
"transformers",
"tokenizers",
- "selenium"
+ "selenium",
+ "PyPDF2"
]
[project.scripts]
@@ -70,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor"
-crawl = "crawl4ai.cli:cli"
+crwl = "crawl4ai.cli:main"
[tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
diff --git a/requirements.txt b/requirements.txt
index 19832b50..0bb596d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,4 +20,6 @@ pyOpenSSL>=24.3.0
psutil>=6.1.1
nltk>=3.9.1
rich>=13.9.4
-cssselect>=1.2.0
\ No newline at end of file
+cssselect>=1.2.0
+chardet>=5.2.0
+brotli>=1.1.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 16b1b53c..a0b91041 100644
--- a/setup.py
+++ b/setup.py
@@ -49,13 +49,12 @@ setup(
url="https://github.com/unclecode/crawl4ai",
author="Unclecode",
author_email="unclecode@kidocode.com",
- license="MIT",
+ license="Apache-2.0",
packages=find_packages(),
package_data={"crawl4ai": ["js_snippet/*.js"]},
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
- "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
diff --git a/ssl_certificate.json b/ssl_certificate.json
deleted file mode 100644
index f6480807..00000000
--- a/ssl_certificate.json
+++ /dev/null
@@ -1,63 +0,0 @@
-{
- "subject": {
- "C": "US",
- "ST": "California",
- "L": "Los Angeles",
- "O": "Internet Corporation for Assigned Names and Numbers",
- "CN": "www.example.org"
- },
- "issuer": {
- "C": "US",
- "O": "DigiCert Inc",
- "CN": "DigiCert Global G2 TLS RSA SHA256 2020 CA1"
- },
- "version": 2,
- "serial_number": "0x75bcef30689c8addf13e51af4afe187",
- "not_before": "20240130000000Z",
- "not_after": "20250301235959Z",
- "fingerprint": "45463a42413a32363a44383a43313a43453a33373a37393a41433a37373a36333a30413a39303a46383a32313a36333a41333a44363a38393a32453a44363a41463a45453a34303a38363a37323a43463a31393a45423a41373a41333a3632",
- "signature_algorithm": "sha256WithRSAEncryption",
- "raw_cert": "MIIHbjCCBlagAwIBAgIQB1vO8waJyK3fE+Ua9K/hhzANBgkqhkiG9w0BAQsFADBZMQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMTMwMQYDVQQDEypEaWdpQ2VydCBHbG9iYWwgRzIgVExTIFJTQSBTSEEyNTYgMjAyMCBDQTEwHhcNMjQwMTMwMDAwMDAwWhcNMjUwMzAxMjM1OTU5WjCBljELMAkGA1UEBhMCVVMxEzARBgNVBAgTCkNhbGlmb3JuaWExFDASBgNVBAcTC0xvcyBBbmdlbGVzMUIwQAYDVQQKDDlJbnRlcm5ldMKgQ29ycG9yYXRpb27CoGZvcsKgQXNzaWduZWTCoE5hbWVzwqBhbmTCoE51bWJlcnMxGDAWBgNVBAMTD3d3dy5leGFtcGxlLm9yZzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAIaFD7sO+cpf2fXgCjIsM9mqDgcpqC8IrXi9wga/9y0rpqcnPVOmTMNLsid3INbBVEm4CNr5cKlh9rJJnWlX2vttJDRyLkfwBD+dsVvivGYxWTLmqX6/1LDUZPVrynv/cltemtg/1Aay88jcj2ZaRoRmqBgVeacIzgU8+zmJ7236TnFSe7fkoKSclsBhPaQKcE3Djs1uszJs8sdECQTdoFX9I6UgeLKFXtg7rRf/hcW5dI0zubhXbrW8aWXbCzySVZn0c7RkJMpnTCiZzNxnPXnHFpwr5quqqjVyN/aBKkjoP04Zmr+eRqoyk/+lslq0sS8eaYSSHbC5ja/yMWyVhvMCAwEAAaOCA/IwggPuMB8GA1UdIwQYMBaAFHSFgMBmx9833s+9KTeqAx2+7c0XMB0GA1UdDgQWBBRM/tASTS4hz2v68vK4TEkCHTGRijCBgQYDVR0RBHoweIIPd3d3LmV4YW1wbGUub3JnggtleGFtcGxlLm5ldIILZXhhbXBsZS5lZHWCC2V4YW1wbGUuY29tggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUuY29tgg93d3cuZXhhbXBsZS5lZHWCD3d3dy5leGFtcGxlLm5ldDA+BgNVHSAENzA1MDMGBmeBDAECAjApMCcGCCsGAQUFBwIBFhtodHRwOi8vd3d3LmRpZ2ljZXJ0LmNvbS9DUFMwDgYDVR0PAQH/BAQDAgWgMB0GA1UdJQQWMBQGCCsGAQUFBwMBBggrBgEFBQcDAjCBnwYDVR0fBIGXMIGUMEigRqBEhkJodHRwOi8vY3JsMy5kaWdpY2VydC5jb20vRGlnaUNlcnRHbG9iYWxHMlRMU1JTQVNIQTI1NjIwMjBDQTEtMS5jcmwwSKBGoESGQmh0dHA6Ly9jcmw0LmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbEcyVExTUlNBU0hBMjU2MjAyMENBMS0xLmNybDCBhwYIKwYBBQUHAQEEezB5MCQGCCsGAQUFBzABhhhodHRwOi8vb2NzcC5kaWdpY2VydC5jb20wUQYIKwYBBQUHMAKGRWh0dHA6Ly9jYWNlcnRzLmRpZ2ljZXJ0LmNvbS9EaWdpQ2VydEdsb2JhbEcyVExTUlNBU0hBMjU2MjAyMENBMS0xLmNydDAMBgNVHRMBAf8EAjAAMIIBfQYKKwYBBAHWeQIEAgSCAW0EggFpAWcAdABOdaMnXJoQwzhbbNTfP1LrHfDgjhuNacCx+mSxYpo53wAAAY1b0vxkAAAEAwBFMEMCH0BRCgxPbBBVxhcWZ26a8JCe83P1JZ6wmv56GsVcyMACIDgpMbEo5HJITTRPnoyT4mG8cLrWjEvhchUdEcWUuk1TAHYAfVkeEuF4KnscYWd8Xv340IdcFKBOlZ65Ay/ZDowuebgAAAGNW9L8MAAABAMARzBFAiBdv5Z3pZFbfgoM3tGpCTM3ZxBMQsxBRSdTS6d8d2NAcwIhALLoCT9mTMN9OyFzIBV5MkXVLyuTf2OAzAOa7d8x2H6XAHcA5tIxY0B3jMEQQQbXcbnOwdJA9paEhvu6hzId/R43jlAAAAGNW9L8XwAABAMASDBGAiEA4Koh/VizdQU1tjZ2E2VGgWSXXkwnQmiYhmAeKcVLHeACIQD7JIGFsdGol7kss2pe4lYrCgPVc+iGZkuqnj26hqhr0TANBgkqhkiG9w0BAQsFAAOCAQEABOFuAj4N4yNG9OOWNQWTNSICC4Rd4nOG1HRP/Bsnrz7KrcPORtb6D+Jx+Q0amhO31QhIvVBYs14gY4Ypyj7MzHgm4VmPXcqLvEkxb2G9Qv9hYuEiNSQmm1fr5QAN/0AzbEbCM3cImLJ69kP5bUjfv/76KB57is8tYf9sh5ikLGKauxCM/zRIcGa3bXLDafk5S2g5Vr2hs230d/NGW1wZrE+zdGuMxfGJzJP+DAFviBfcQnFg4+1zMEKcqS87oniOyG+60RMM0MdejBD7AS43m9us96Gsun/4kufLQUTIFfnzxLutUV++3seshgefQOy5C/ayi8y1VTNmujPCxPCi6Q==",
- "extensions": [
- {
- "name": "authorityKeyIdentifier",
- "value": "74:85:80:C0:66:C7:DF:37:DE:CF:BD:29:37:AA:03:1D:BE:ED:CD:17"
- },
- {
- "name": "subjectKeyIdentifier",
- "value": "4C:FE:D0:12:4D:2E:21:CF:6B:FA:F2:F2:B8:4C:49:02:1D:31:91:8A"
- },
- {
- "name": "subjectAltName",
- "value": "DNS:www.example.org, DNS:example.net, DNS:example.edu, DNS:example.com, DNS:example.org, DNS:www.example.com, DNS:www.example.edu, DNS:www.example.net"
- },
- {
- "name": "certificatePolicies",
- "value": "Policy: 2.23.140.1.2.2\n CPS: http://www.digicert.com/CPS"
- },
- {
- "name": "keyUsage",
- "value": "Digital Signature, Key Encipherment"
- },
- {
- "name": "extendedKeyUsage",
- "value": "TLS Web Server Authentication, TLS Web Client Authentication"
- },
- {
- "name": "crlDistributionPoints",
- "value": "Full Name:\n URI:http://crl3.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crl\nFull Name:\n URI:http://crl4.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crl"
- },
- {
- "name": "authorityInfoAccess",
- "value": "OCSP - URI:http://ocsp.digicert.com\nCA Issuers - URI:http://cacerts.digicert.com/DigiCertGlobalG2TLSRSASHA2562020CA1-1.crt"
- },
- {
- "name": "basicConstraints",
- "value": "CA:FALSE"
- },
- {
- "name": "ct_precert_scts",
- "value": "Signed Certificate Timestamp:\n Version : v1 (0x0)\n Log ID : 4E:75:A3:27:5C:9A:10:C3:38:5B:6C:D4:DF:3F:52:EB:\n 1D:F0:E0:8E:1B:8D:69:C0:B1:FA:64:B1:62:9A:39:DF\n Timestamp : Jan 30 19:22:50.340 2024 GMT\n Extensions: none\n Signature : ecdsa-with-SHA256\n 30:43:02:1F:40:51:0A:0C:4F:6C:10:55:C6:17:16:67:\n 6E:9A:F0:90:9E:F3:73:F5:25:9E:B0:9A:FE:7A:1A:C5:\n 5C:C8:C0:02:20:38:29:31:B1:28:E4:72:48:4D:34:4F:\n 9E:8C:93:E2:61:BC:70:BA:D6:8C:4B:E1:72:15:1D:11:\n C5:94:BA:4D:53\nSigned Certificate Timestamp:\n Version : v1 (0x0)\n Log ID : 7D:59:1E:12:E1:78:2A:7B:1C:61:67:7C:5E:FD:F8:D0:\n 87:5C:14:A0:4E:95:9E:B9:03:2F:D9:0E:8C:2E:79:B8\n Timestamp : Jan 30 19:22:50.288 2024 GMT\n Extensions: none\n Signature : ecdsa-with-SHA256\n 30:45:02:20:5D:BF:96:77:A5:91:5B:7E:0A:0C:DE:D1:\n A9:09:33:37:67:10:4C:42:CC:41:45:27:53:4B:A7:7C:\n 77:63:40:73:02:21:00:B2:E8:09:3F:66:4C:C3:7D:3B:\n 21:73:20:15:79:32:45:D5:2F:2B:93:7F:63:80:CC:03:\n 9A:ED:DF:31:D8:7E:97\nSigned Certificate Timestamp:\n Version : v1 (0x0)\n Log ID : E6:D2:31:63:40:77:8C:C1:10:41:06:D7:71:B9:CE:C1:\n D2:40:F6:96:84:86:FB:BA:87:32:1D:FD:1E:37:8E:50\n Timestamp : Jan 30 19:22:50.335 2024 GMT\n Extensions: none\n Signature : ecdsa-with-SHA256\n 30:46:02:21:00:E0:AA:21:FD:58:B3:75:05:35:B6:36:\n 76:13:65:46:81:64:97:5E:4C:27:42:68:98:86:60:1E:\n 29:C5:4B:1D:E0:02:21:00:FB:24:81:85:B1:D1:A8:97:\n B9:2C:B3:6A:5E:E2:56:2B:0A:03:D5:73:E8:86:66:4B:\n AA:9E:3D:BA:86:A8:6B:D1"
- }
- ]
-}
\ No newline at end of file
diff --git a/tests/async/test_chunking_and_extraction_strategies.py b/tests/async/test_chunking_and_extraction_strategies.py
index ab9daddc..90e17a9d 100644
--- a/tests/async/test_chunking_and_extraction_strategies.py
+++ b/tests/async/test_chunking_and_extraction_strategies.py
@@ -7,6 +7,7 @@ import json
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
+from crawl4ai import LLMConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy
@@ -48,8 +49,7 @@ async def test_llm_extraction_strategy():
async with AsyncWebCrawler(verbose=True) as crawler:
url = "https://www.nbcnews.com/business"
extraction_strategy = LLMExtractionStrategy(
- provider="openai/gpt-4o-mini",
- api_token=os.getenv("OPENAI_API_KEY"),
+ llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract only content related to technology",
)
result = await crawler.arun(
diff --git a/tests/browser/docker/__init__.py b/tests/browser/docker/__init__.py
new file mode 100644
index 00000000..b86e573c
--- /dev/null
+++ b/tests/browser/docker/__init__.py
@@ -0,0 +1,4 @@
+"""Docker browser strategy tests.
+
+This package contains tests for the Docker browser strategy implementation.
+"""
\ No newline at end of file
diff --git a/tests/browser/docker/test_docker_browser.py b/tests/browser/docker/test_docker_browser.py
new file mode 100644
index 00000000..2ec64a6b
--- /dev/null
+++ b/tests/browser/docker/test_docker_browser.py
@@ -0,0 +1,651 @@
+"""Test examples for Docker Browser Strategy.
+
+These examples demonstrate the functionality of Docker Browser Strategy
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import sys
+import shutil
+import uuid
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+from crawl4ai.browser import DockerConfig
+from crawl4ai.browser import DockerRegistry
+from crawl4ai.browser import DockerUtils
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+# Global Docker utils instance
+docker_utils = DockerUtils(logger)
+
+async def test_docker_components():
+ """Test Docker utilities, registry, and image building.
+
+ This function tests the core Docker components before running the browser tests.
+ It validates DockerRegistry, DockerUtils, and builds test images to ensure
+ everything is functioning correctly.
+ """
+ logger.info("Testing Docker components", tag="SETUP")
+
+ # Create a test registry directory
+ registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
+ registry_file = os.path.join(registry_dir, "test_registry.json")
+ os.makedirs(registry_dir, exist_ok=True)
+
+ try:
+ # 1. Test DockerRegistry
+ logger.info("Testing DockerRegistry...", tag="SETUP")
+ registry = DockerRegistry(registry_file)
+
+ # Test saving and loading registry
+ test_container_id = "test-container-123"
+ registry.register_container(test_container_id, 9876, "test-hash-123")
+ registry.save()
+
+ # Create a new registry instance that loads from the file
+ registry2 = DockerRegistry(registry_file)
+ port = registry2.get_container_host_port(test_container_id)
+ hash_value = registry2.get_container_config_hash(test_container_id)
+
+ if port != 9876 or hash_value != "test-hash-123":
+ logger.error("DockerRegistry persistence failed", tag="SETUP")
+ return False
+
+ # Clean up test container from registry
+ registry2.unregister_container(test_container_id)
+ logger.success("DockerRegistry works correctly", tag="SETUP")
+
+ # 2. Test DockerUtils
+ logger.info("Testing DockerUtils...", tag="SETUP")
+
+ # Test port detection
+ in_use = docker_utils.is_port_in_use(22) # SSH port is usually in use
+ logger.info(f"Port 22 in use: {in_use}", tag="SETUP")
+
+ # Get next available port
+ available_port = docker_utils.get_next_available_port(9000)
+ logger.info(f"Next available port: {available_port}", tag="SETUP")
+
+ # Test config hash generation
+ config_dict = {"mode": "connect", "headless": True}
+ config_hash = docker_utils.generate_config_hash(config_dict)
+ logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")
+
+ # 3. Test Docker is available
+ logger.info("Checking Docker availability...", tag="SETUP")
+ if not await check_docker_available():
+ logger.error("Docker is not available - cannot continue tests", tag="SETUP")
+ return False
+
+ # 4. Test building connect image
+ logger.info("Building connect mode Docker image...", tag="SETUP")
+ connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
+ if not connect_image:
+ logger.error("Failed to build connect mode image", tag="SETUP")
+ return False
+ logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")
+
+ # 5. Test building launch image
+ logger.info("Building launch mode Docker image...", tag="SETUP")
+ launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
+ if not launch_image:
+ logger.error("Failed to build launch mode image", tag="SETUP")
+ return False
+ logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")
+
+ # 6. Test creating and removing container
+ logger.info("Testing container creation and removal...", tag="SETUP")
+ container_id = await docker_utils.create_container(
+ image_name=launch_image,
+ host_port=available_port,
+ container_name="crawl4ai-test-container"
+ )
+
+ if not container_id:
+ logger.error("Failed to create test container", tag="SETUP")
+ return False
+
+ logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")
+
+ # Verify container is running
+ running = await docker_utils.is_container_running(container_id)
+ if not running:
+ logger.error("Test container is not running", tag="SETUP")
+ await docker_utils.remove_container(container_id)
+ return False
+
+ # Test commands in container
+ logger.info("Testing command execution in container...", tag="SETUP")
+ returncode, stdout, stderr = await docker_utils.exec_in_container(
+ container_id, ["ls", "-la", "/"]
+ )
+
+ if returncode != 0:
+ logger.error(f"Command execution failed: {stderr}", tag="SETUP")
+ await docker_utils.remove_container(container_id)
+ return False
+
+ # Verify Chrome is installed in the container
+ returncode, stdout, stderr = await docker_utils.exec_in_container(
+ container_id, ["which", "chromium"]
+ )
+
+ if returncode != 0:
+ logger.error("Chrome not found in container", tag="SETUP")
+ await docker_utils.remove_container(container_id)
+ return False
+
+ chrome_path = stdout.strip()
+ logger.info(f"Chrome found at: {chrome_path}", tag="SETUP")
+
+ # Test Chrome version
+ returncode, stdout, stderr = await docker_utils.exec_in_container(
+ container_id, ["chromium", "--version"]
+ )
+
+ if returncode != 0:
+ logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
+ await docker_utils.remove_container(container_id)
+ return False
+
+ logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")
+
+ # Remove test container
+ removed = await docker_utils.remove_container(container_id)
+ if not removed:
+ logger.error("Failed to remove test container", tag="SETUP")
+ return False
+
+ logger.success("Test container removed successfully", tag="SETUP")
+
+ # All components tested successfully
+ logger.success("All Docker components tested successfully", tag="SETUP")
+ return True
+
+ except Exception as e:
+ logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
+ return False
+ finally:
+ # Clean up registry test directory
+ if os.path.exists(registry_dir):
+ shutil.rmtree(registry_dir)
+
+async def test_docker_connect_mode():
+ """Test Docker browser in connect mode.
+
+ This tests the basic functionality of creating a browser in Docker
+ connect mode and using it for navigation.
+ """
+ logger.info("Testing Docker browser in connect mode", tag="TEST")
+
+ # Create temp directory for user data
+ temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ try:
+ # Create Docker configuration
+ docker_config = DockerConfig(
+ mode="connect",
+ persistent=False,
+ remove_on_exit=True,
+ user_data_dir=temp_dir
+ )
+
+ # Create browser configuration
+ browser_config = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ docker_config=docker_config
+ )
+
+ # Create browser manager
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create crawler config
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+
+ # Get a page
+ page, context = await manager.get_page(crawler_config)
+ logger.info("Got page successfully", tag="TEST")
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ logger.info("Navigated to example.com", tag="TEST")
+
+ # Get page title
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+ finally:
+ # Clean up the temp directory
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+async def test_docker_launch_mode():
+ """Test Docker browser in launch mode.
+
+ This tests launching a Chrome browser within a Docker container
+ on demand with custom settings.
+ """
+ logger.info("Testing Docker browser in launch mode", tag="TEST")
+
+ # Create temp directory for user data
+ temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ try:
+ # Create Docker configuration
+ docker_config = DockerConfig(
+ mode="launch",
+ persistent=False,
+ remove_on_exit=True,
+ user_data_dir=temp_dir
+ )
+
+ # Create browser configuration
+ browser_config = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ text_mode=True, # Enable text mode for faster operation
+ docker_config=docker_config
+ )
+
+ # Create browser manager
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create crawler config
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+
+ # Get a page
+ page, context = await manager.get_page(crawler_config)
+ logger.info("Got page successfully", tag="TEST")
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ logger.info("Navigated to example.com", tag="TEST")
+
+ # Get page title
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+ finally:
+ # Clean up the temp directory
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+async def test_docker_persistent_storage():
+ """Test Docker browser with persistent storage.
+
+ This tests creating localStorage data in one session and verifying
+ it persists to another session when using persistent storage.
+ """
+ logger.info("Testing Docker browser with persistent storage", tag="TEST")
+
+ # Create a unique temp directory
+ test_id = uuid.uuid4().hex[:8]
+ temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ manager1 = None
+ manager2 = None
+
+ try:
+ # Create Docker configuration with persistence
+ docker_config = DockerConfig(
+ mode="connect",
+ persistent=True, # Keep container running between sessions
+ user_data_dir=temp_dir,
+ container_user_data_dir="/data"
+ )
+
+ # Create browser configuration
+ browser_config = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ docker_config=docker_config
+ )
+
+ # Create first browser manager
+ manager1 = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager1.start()
+ logger.info("First browser started successfully", tag="TEST")
+
+ # Create crawler config
+ crawler_config = CrawlerRunConfig()
+
+ # Get a page
+ page1, context1 = await manager1.get_page(crawler_config)
+
+ # Navigate to example.com
+ await page1.goto("https://example.com")
+
+ # Set localStorage item
+ test_value = f"test_value_{test_id}"
+ await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
+ logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")
+
+ # Close the first browser manager
+ await manager1.close()
+ logger.info("First browser closed", tag="TEST")
+
+ # Create second browser manager with same config
+ manager2 = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager2.start()
+ logger.info("Second browser started successfully", tag="TEST")
+
+ # Get a page
+ page2, context2 = await manager2.get_page(crawler_config)
+
+ # Navigate to same site
+ await page2.goto("https://example.com")
+
+ # Get localStorage item
+ value = await page2.evaluate("localStorage.getItem('test_key')")
+ logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")
+
+ # Check if persistence worked
+ if value == test_value:
+ logger.success("Storage persistence verified!", tag="TEST")
+ else:
+ logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")
+
+ # Clean up
+ await manager2.close()
+ logger.info("Second browser closed successfully", tag="TEST")
+
+ return value == test_value
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ if manager1:
+ await manager1.close()
+ if manager2:
+ await manager2.close()
+ except:
+ pass
+ return False
+ finally:
+ # Clean up the temp directory
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+async def test_docker_parallel_pages():
+ """Test Docker browser with parallel page creation.
+
+ This tests the ability to create and use multiple pages in parallel
+ from a single Docker browser instance.
+ """
+ logger.info("Testing Docker browser with parallel pages", tag="TEST")
+
+ try:
+ # Create Docker configuration
+ docker_config = DockerConfig(
+ mode="connect",
+ persistent=False,
+ remove_on_exit=True
+ )
+
+ # Create browser configuration
+ browser_config = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ docker_config=docker_config
+ )
+
+ # Create browser manager
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create crawler config
+ crawler_config = CrawlerRunConfig()
+
+ # Get multiple pages
+ page_count = 3
+ pages = await manager.get_pages(crawler_config, count=page_count)
+ logger.info(f"Got {len(pages)} pages successfully", tag="TEST")
+
+ if len(pages) != page_count:
+ logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
+ await manager.close()
+ return False
+
+ # Navigate to different sites with each page
+ tasks = []
+ for i, (page, _) in enumerate(pages):
+ tasks.append(page.goto(f"https://example.com?page={i}"))
+
+ # Wait for all navigations to complete
+ await asyncio.gather(*tasks)
+ logger.info("All pages navigated successfully", tag="TEST")
+
+ # Get titles from all pages
+ titles = []
+ for i, (page, _) in enumerate(pages):
+ title = await page.title()
+ titles.append(title)
+ logger.info(f"Page {i+1} title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_docker_registry_reuse():
+ """Test Docker container reuse via registry.
+
+ This tests that containers with matching configurations
+ are reused rather than creating new ones.
+ """
+ logger.info("Testing Docker container reuse via registry", tag="TEST")
+
+ # Create registry for this test
+ registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
+ registry_file = os.path.join(registry_dir, "registry.json")
+ os.makedirs(registry_dir, exist_ok=True)
+
+ manager1 = None
+ manager2 = None
+ container_id1 = None
+
+ try:
+ # Create identical Docker configurations with custom registry
+ docker_config1 = DockerConfig(
+ mode="connect",
+ persistent=True, # Keep container running after closing
+ registry_file=registry_file
+ )
+
+ # Create first browser configuration
+ browser_config1 = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ docker_config=docker_config1
+ )
+
+ # Create first browser manager
+ manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
+
+ # Start the first browser
+ await manager1.start()
+ logger.info("First browser started successfully", tag="TEST")
+
+ # Get container ID from the strategy
+ docker_strategy1 = manager1.strategy
+ container_id1 = docker_strategy1.container_id
+ logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")
+
+ # Close the first manager but keep container running
+ await manager1.close()
+ logger.info("First browser closed", tag="TEST")
+
+ # Create second Docker configuration identical to first
+ docker_config2 = DockerConfig(
+ mode="connect",
+ persistent=True,
+ registry_file=registry_file
+ )
+
+ # Create second browser configuration
+ browser_config2 = BrowserConfig(
+ browser_mode="docker",
+ headless=True,
+ docker_config=docker_config2
+ )
+
+ # Create second browser manager
+ manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
+
+ # Start the second browser - should reuse existing container
+ await manager2.start()
+ logger.info("Second browser started successfully", tag="TEST")
+
+ # Get container ID from the second strategy
+ docker_strategy2 = manager2.strategy
+ container_id2 = docker_strategy2.container_id
+ logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")
+
+ # Verify container reuse
+ if container_id1 == container_id2:
+ logger.success("Container reuse successful - using same container!", tag="TEST")
+ else:
+ logger.error("Container reuse failed - new container created!", tag="TEST")
+
+ # Clean up
+ docker_strategy2.docker_config.persistent = False
+ docker_strategy2.docker_config.remove_on_exit = True
+ await manager2.close()
+ logger.info("Second browser closed and container removed", tag="TEST")
+
+ return container_id1 == container_id2
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ if manager1:
+ await manager1.close()
+ if manager2:
+ await manager2.close()
+ # Make sure container is removed
+ if container_id1:
+ await docker_utils.remove_container(container_id1, force=True)
+ except:
+ pass
+ return False
+ finally:
+ # Clean up registry directory
+ if os.path.exists(registry_dir):
+ shutil.rmtree(registry_dir)
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ logger.info("Starting Docker Browser Strategy tests", tag="TEST")
+
+ # Check if Docker is available
+ if not await check_docker_available():
+ logger.error("Docker is not available - skipping tests", tag="TEST")
+ return
+
+ # First test Docker components
+ # setup_result = await test_docker_components()
+ # if not setup_result:
+ # logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
+ # return
+
+ # Run browser tests
+ results.append(await test_docker_connect_mode())
+ results.append(await test_docker_launch_mode())
+ results.append(await test_docker_persistent_storage())
+ results.append(await test_docker_parallel_pages())
+ results.append(await test_docker_registry_reuse())
+
+ # Print summary
+ total = len(results)
+ passed = sum(1 for r in results if r)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+async def check_docker_available() -> bool:
+ """Check if Docker is available on the system.
+
+ Returns:
+ bool: True if Docker is available, False otherwise
+ """
+ try:
+ proc = await asyncio.create_subprocess_exec(
+ "docker", "--version",
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE
+ )
+ stdout, _ = await proc.communicate()
+ return proc.returncode == 0 and stdout
+ except:
+ return False
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/tests/browser/manager/demo_browser_manager.py b/tests/browser/manager/demo_browser_manager.py
new file mode 100644
index 00000000..2fde7e8a
--- /dev/null
+++ b/tests/browser/manager/demo_browser_manager.py
@@ -0,0 +1,525 @@
+"""Demo script for testing the enhanced BrowserManager.
+
+This script demonstrates the browser pooling capabilities of the enhanced
+BrowserManager with various configurations and usage patterns.
+"""
+
+import asyncio
+import time
+import random
+
+from crawl4ai.browser.manager import BrowserManager, UnavailableBehavior
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+import playwright
+
+SAFE_URLS = [
+ "https://example.com",
+ "https://example.com/page1",
+ "https://httpbin.org/get",
+ "https://httpbin.org/html",
+ "https://httpbin.org/ip",
+ "https://httpbin.org/user-agent",
+ "https://httpbin.org/headers",
+ "https://httpbin.org/cookies",
+ "https://httpstat.us/200",
+ "https://httpstat.us/301",
+ "https://httpstat.us/404",
+ "https://httpstat.us/500",
+ "https://jsonplaceholder.typicode.com/posts/1",
+ "https://jsonplaceholder.typicode.com/posts/2",
+ "https://jsonplaceholder.typicode.com/posts/3",
+ "https://jsonplaceholder.typicode.com/posts/4",
+ "https://jsonplaceholder.typicode.com/posts/5",
+ "https://jsonplaceholder.typicode.com/comments/1",
+ "https://jsonplaceholder.typicode.com/comments/2",
+ "https://jsonplaceholder.typicode.com/users/1",
+ "https://jsonplaceholder.typicode.com/users/2",
+ "https://jsonplaceholder.typicode.com/albums/1",
+ "https://jsonplaceholder.typicode.com/albums/2",
+ "https://jsonplaceholder.typicode.com/photos/1",
+ "https://jsonplaceholder.typicode.com/photos/2",
+ "https://jsonplaceholder.typicode.com/todos/1",
+ "https://jsonplaceholder.typicode.com/todos/2",
+ "https://www.iana.org",
+ "https://www.iana.org/domains",
+ "https://www.iana.org/numbers",
+ "https://www.iana.org/protocols",
+ "https://www.iana.org/about",
+ "https://www.iana.org/time-zones",
+ "https://www.data.gov",
+ "https://catalog.data.gov/dataset",
+ "https://www.archives.gov",
+ "https://www.usa.gov",
+ "https://www.loc.gov",
+ "https://www.irs.gov",
+ "https://www.census.gov",
+ "https://www.bls.gov",
+ "https://www.gpo.gov",
+ "https://www.w3.org",
+ "https://www.w3.org/standards",
+ "https://www.w3.org/WAI",
+ "https://www.rfc-editor.org",
+ "https://www.ietf.org",
+ "https://www.icann.org",
+ "https://www.internetsociety.org",
+ "https://www.python.org"
+]
+
+async def basic_pooling_demo():
+ """Demonstrate basic browser pooling functionality."""
+ print("\n=== Basic Browser Pooling Demo ===")
+
+ # Create logger
+ logger = AsyncLogger(verbose=True)
+
+ # Create browser configurations
+ config1 = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ browser_mode="playwright"
+ )
+
+ config2 = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ browser_mode="cdp"
+ )
+
+ # Create browser manager with on-demand behavior
+ manager = BrowserManager(
+ browser_config=config1,
+ logger=logger,
+ unavailable_behavior=UnavailableBehavior.ON_DEMAND,
+ max_browsers_per_config=3
+ )
+
+ try:
+ # Initialize pool with both configurations
+ print("Initializing browser pool...")
+ await manager.initialize_pool(
+ browser_configs=[config1, config2],
+ browsers_per_config=2
+ )
+
+ # Display initial pool status
+ status = await manager.get_pool_status()
+ print(f"Initial pool status: {status}")
+
+ # Create crawler run configurations
+ run_config1 = CrawlerRunConfig()
+ run_config2 = CrawlerRunConfig()
+
+ # Simulate concurrent page requests
+ print("\nGetting pages for parallel crawling...")
+
+ # Function to simulate crawling
+ async def simulate_crawl(index: int, config: BrowserConfig, run_config: CrawlerRunConfig):
+ print(f"Crawler {index}: Requesting page...")
+ page, context, strategy = await manager.get_page(run_config, config)
+ print(f"Crawler {index}: Got page, navigating to example.com...")
+
+ try:
+ await page.goto("https://example.com")
+ title = await page.title()
+ print(f"Crawler {index}: Page title: {title}")
+
+ # Simulate work
+ await asyncio.sleep(random.uniform(1, 3))
+ print(f"Crawler {index}: Work completed, releasing page...")
+
+ # Check dynamic page content
+ content = await page.content()
+ content_length = len(content)
+ print(f"Crawler {index}: Page content length: {content_length}")
+
+ except Exception as e:
+ print(f"Crawler {index}: Error: {str(e)}")
+ finally:
+ # Release the page
+ await manager.release_page(page, strategy, config)
+ print(f"Crawler {index}: Page released")
+
+ # Create 5 parallel crawls
+ crawl_tasks = []
+ for i in range(5):
+ # Alternate between configurations
+ config = config1 if i % 2 == 0 else config2
+ run_config = run_config1 if i % 2 == 0 else run_config2
+
+ task = asyncio.create_task(simulate_crawl(i+1, config, run_config))
+ crawl_tasks.append(task)
+
+ # Wait for all crawls to complete
+ await asyncio.gather(*crawl_tasks)
+
+ # Display final pool status
+ status = await manager.get_pool_status()
+ print(f"\nFinal pool status: {status}")
+
+ finally:
+ # Clean up
+ print("\nClosing browser manager...")
+ await manager.close()
+ print("Browser manager closed")
+
+
+async def prewarm_pages_demo():
+ """Demonstrate page pre-warming functionality."""
+ print("\n=== Page Pre-warming Demo ===")
+
+ # Create logger
+ logger = AsyncLogger(verbose=True)
+
+ # Create browser configuration
+ config = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ browser_mode="playwright"
+ )
+
+ # Create crawler run configurations for pre-warming
+ run_config1 = CrawlerRunConfig(
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ )
+
+ run_config2 = CrawlerRunConfig(
+ user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
+ )
+
+ # Create page pre-warm configurations
+ page_configs = [
+ (config, run_config1, 2), # 2 pages with run_config1
+ (config, run_config2, 3) # 3 pages with run_config2
+ ]
+
+ # Create browser manager
+ manager = BrowserManager(
+ browser_config=config,
+ logger=logger,
+ unavailable_behavior=UnavailableBehavior.EXCEPTION
+ )
+
+ try:
+ # Initialize pool with pre-warmed pages
+ print("Initializing browser pool with pre-warmed pages...")
+ await manager.initialize_pool(
+ browser_configs=[config],
+ browsers_per_config=2,
+ page_configs=page_configs
+ )
+
+ # Display pool status
+ status = await manager.get_pool_status()
+ print(f"Pool status after pre-warming: {status}")
+
+ # Simulate using pre-warmed pages
+ print("\nUsing pre-warmed pages...")
+
+ async def use_prewarm_page(index: int, run_config: CrawlerRunConfig):
+ print(f"Task {index}: Requesting pre-warmed page...")
+ page, context, strategy = await manager.get_page(run_config, config)
+
+ try:
+ print(f"Task {index}: Got page, navigating to example.com...")
+ await page.goto("https://example.com")
+
+ # Verify user agent was applied correctly
+ user_agent = await page.evaluate("() => navigator.userAgent")
+ print(f"Task {index}: User agent: {user_agent}")
+
+ # Get page title
+ title = await page.title()
+ print(f"Task {index}: Page title: {title}")
+
+ # Simulate work
+ await asyncio.sleep(1)
+ finally:
+ # Release the page
+ print(f"Task {index}: Releasing page...")
+ await manager.release_page(page, strategy, config)
+
+ # Create tasks to use pre-warmed pages
+ tasks = []
+ # Use run_config1 pages
+ for i in range(2):
+ tasks.append(asyncio.create_task(use_prewarm_page(i+1, run_config1)))
+
+ # Use run_config2 pages
+ for i in range(3):
+ tasks.append(asyncio.create_task(use_prewarm_page(i+3, run_config2)))
+
+ # Wait for all tasks to complete
+ await asyncio.gather(*tasks)
+
+ # Try to use more pages than we pre-warmed (should raise exception)
+ print("\nTrying to use more pages than pre-warmed...")
+ try:
+ page, context, strategy = await manager.get_page(run_config1, config)
+ try:
+ print("Got extra page (unexpected)")
+ await page.goto("https://example.com")
+ finally:
+ await manager.release_page(page, strategy, config)
+ except Exception as e:
+ print(f"Expected exception when requesting more pages: {str(e)}")
+
+ finally:
+ # Clean up
+ print("\nClosing browser manager...")
+ await manager.close()
+ print("Browser manager closed")
+
+
+async def prewarm_on_demand_demo():
+ """Demonstrate pre-warming with on-demand browser creation."""
+ print("\n=== Pre-warming with On-Demand Browser Creation Demo ===")
+
+ # Create logger
+ logger = AsyncLogger(verbose=True)
+
+ # Create browser configuration
+ config = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ browser_mode="playwright"
+ )
+
+ # Create crawler run configurations
+ run_config = CrawlerRunConfig(
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ )
+
+ # Create page pre-warm configurations - just pre-warm 2 pages
+ page_configs = [
+ (config, run_config, 2)
+ ]
+
+ # Create browser manager with ON_DEMAND behavior
+ manager = BrowserManager(
+ browser_config=config,
+ logger=logger,
+ unavailable_behavior=UnavailableBehavior.ON_DEMAND,
+ max_browsers_per_config=5 # Allow up to 5 browsers
+ )
+
+ try:
+ # Initialize pool with pre-warmed pages
+ print("Initializing browser pool with pre-warmed pages...")
+ await manager.initialize_pool(
+ browser_configs=[config],
+ browsers_per_config=1, # Start with just 1 browser
+ page_configs=page_configs
+ )
+
+ # Display initial pool status
+ status = await manager.get_pool_status()
+ print(f"Initial pool status: {status}")
+
+ # Simulate using more pages than pre-warmed - should create browsers on demand
+ print("\nUsing more pages than pre-warmed (should create on demand)...")
+
+ async def use_page(index: int):
+ print(f"Task {index}: Requesting page...")
+ page, context, strategy = await manager.get_page(run_config, config)
+
+ try:
+ print(f"Task {index}: Got page, navigating to example.com...")
+ await page.goto("https://example.com")
+
+ # Get page title
+ title = await page.title()
+ print(f"Task {index}: Page title: {title}")
+
+ # Simulate work for a varying amount of time
+ work_time = 1 + (index * 0.5) # Stagger completion times
+ print(f"Task {index}: Working for {work_time} seconds...")
+ await asyncio.sleep(work_time)
+ print(f"Task {index}: Work completed")
+ finally:
+ # Release the page
+ print(f"Task {index}: Releasing page...")
+ await manager.release_page(page, strategy, config)
+
+ # Create more tasks than pre-warmed pages
+ tasks = []
+ for i in range(5): # Try to use 5 pages when only 2 are pre-warmed
+ tasks.append(asyncio.create_task(use_page(i+1)))
+
+ # Wait for all tasks to complete
+ await asyncio.gather(*tasks)
+
+ # Display final pool status - should show on-demand created browsers
+ status = await manager.get_pool_status()
+ print(f"\nFinal pool status: {status}")
+
+ finally:
+ # Clean up
+ print("\nClosing browser manager...")
+ await manager.close()
+ print("Browser manager closed")
+
+
+async def high_volume_demo():
+ """Demonstrate high-volume access to pre-warmed pages."""
+ print("\n=== High Volume Pre-warmed Pages Demo ===")
+
+ # Create logger
+ logger = AsyncLogger(verbose=True)
+
+ # Create browser configuration
+ config = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ browser_mode="playwright"
+ )
+
+ # Create crawler run configuration
+ run_config = CrawlerRunConfig(
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ )
+
+ # Set up dimensions
+ browser_count = 10
+ pages_per_browser = 5
+ total_pages = browser_count * pages_per_browser
+
+ # Create page pre-warm configuration
+ page_configs = [
+ (config, run_config, total_pages)
+ ]
+
+ print(f"Preparing {browser_count} browsers with {pages_per_browser} pages each ({total_pages} total pages)")
+
+ # Create browser manager with ON_DEMAND behavior as fallback
+ # No need to specify max_browsers_per_config as it will be calculated automatically
+ manager = BrowserManager(
+ browser_config=config,
+ logger=logger,
+ unavailable_behavior=UnavailableBehavior.ON_DEMAND
+ )
+
+ try:
+ # Initialize pool with browsers and pre-warmed pages
+ print(f"Pre-warming {total_pages} pages...")
+ start_time = time.time()
+ await manager.initialize_pool(
+ browser_configs=[config],
+ browsers_per_config=browser_count,
+ page_configs=page_configs
+ )
+ warmup_time = time.time() - start_time
+ print(f"Pre-warming completed in {warmup_time:.2f} seconds")
+
+ # Display pool status
+ status = await manager.get_pool_status()
+ print(f"Pool status after pre-warming: {status}")
+
+ # Simulate using all pre-warmed pages simultaneously
+ print(f"\nSending {total_pages} crawl requests simultaneously...")
+
+ async def crawl_page(index: int):
+ # url = f"https://example.com/page{index}"
+ url = SAFE_URLS[index % len(SAFE_URLS)]
+ print(f"Page {index}: Requesting page...")
+ # Measure time to acquire page
+ page_start = time.time()
+ page, context, strategy = await manager.get_page(run_config, config)
+ page_acquisition_time = time.time() - page_start
+
+ try:
+ # Navigate to the URL
+ nav_start = time.time()
+ await page.goto(url, timeout=5000)
+ navigation_time = time.time() - nav_start
+
+ # Get the page title
+ title = await page.title()
+
+ return {
+ "index": index,
+ "url": url,
+ "title": title,
+ "page_acquisition_time": page_acquisition_time,
+ "navigation_time": navigation_time
+ }
+ except playwright._impl._errors.TimeoutError as e:
+ # print(f"Page {index}: Navigation timed out - {e}")
+ return {
+ "index": index,
+ "url": url,
+ "title": "Navigation timed out",
+ "page_acquisition_time": page_acquisition_time,
+ "navigation_time": 0
+ }
+ finally:
+ # Release the page
+ await manager.release_page(page, strategy, config)
+
+ # Create and execute all tasks simultaneously
+ start_time = time.time()
+
+ # Non-parallel way
+ # for i in range(total_pages):
+ # await crawl_page(i+1)
+
+ tasks = [crawl_page(i+1) for i in range(total_pages)]
+ results = await asyncio.gather(*tasks)
+ total_time = time.time() - start_time
+
+ # # Print all titles
+ # for result in results:
+ # print(f"Page {result['index']} ({result['url']}): Title: {result['title']}")
+ # print(f" Page acquisition time: {result['page_acquisition_time']:.4f}s")
+ # print(f" Navigation time: {result['navigation_time']:.4f}s")
+ # print(f" Total time: {result['page_acquisition_time'] + result['navigation_time']:.4f}s")
+ # print("-" * 40)
+
+ # Report results
+ print(f"\nAll {total_pages} crawls completed in {total_time:.2f} seconds")
+
+ # Calculate statistics
+ acquisition_times = [r["page_acquisition_time"] for r in results]
+ navigation_times = [r["navigation_time"] for r in results]
+
+ avg_acquisition = sum(acquisition_times) / len(acquisition_times)
+ max_acquisition = max(acquisition_times)
+ min_acquisition = min(acquisition_times)
+
+ avg_navigation = sum(navigation_times) / len(navigation_times)
+ max_navigation = max(navigation_times)
+ min_navigation = min(navigation_times)
+
+ print("\nPage acquisition times:")
+ print(f" Average: {avg_acquisition:.4f}s")
+ print(f" Min: {min_acquisition:.4f}s")
+ print(f" Max: {max_acquisition:.4f}s")
+
+ print("\nPage navigation times:")
+ print(f" Average: {avg_navigation:.4f}s")
+ print(f" Min: {min_navigation:.4f}s")
+ print(f" Max: {max_navigation:.4f}s")
+
+ # Display final pool status
+ status = await manager.get_pool_status()
+ print(f"\nFinal pool status: {status}")
+
+ finally:
+ # Clean up
+ print("\nClosing browser manager...")
+ await manager.close()
+ print("Browser manager closed")
+
+
+async def main():
+ """Run all demos."""
+ # await basic_pooling_demo()
+ # await prewarm_pages_demo()
+ # await prewarm_on_demand_demo()
+ await high_volume_demo()
+ # Additional demo functions can be added here
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/browser/test_browser_manager.py b/tests/browser/test_browser_manager.py
new file mode 100644
index 00000000..d8f9376d
--- /dev/null
+++ b/tests/browser/test_browser_manager.py
@@ -0,0 +1,190 @@
+"""Test examples for BrowserManager.
+
+These examples demonstrate the functionality of BrowserManager
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import sys
+from typing import List
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def test_basic_browser_manager():
+ """Test basic BrowserManager functionality with default configuration."""
+ logger.info("Starting test_basic_browser_manager", tag="TEST")
+
+ try:
+ # Create a browser manager with default config
+ manager = BrowserManager(logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Get a page
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+ page, context = await manager.get_page(crawler_config)
+ logger.info("Page created successfully", tag="TEST")
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.success("test_basic_browser_manager completed successfully", tag="TEST")
+ return True
+ except Exception as e:
+ logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
+ return False
+
+async def test_custom_browser_config():
+ """Test BrowserManager with custom browser configuration."""
+ logger.info("Starting test_custom_browser_config", tag="TEST")
+
+ try:
+ # Create a custom browser config
+ browser_config = BrowserConfig(
+ browser_type="chromium",
+ headless=True,
+ viewport_width=1280,
+ viewport_height=800,
+ light_mode=True
+ )
+
+ # Create browser manager with the config
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully with custom config", tag="TEST")
+
+ # Get a page
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+ page, context = await manager.get_page(crawler_config)
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Verify viewport size
+ viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
+ logger.info(f"Viewport size: {viewport_size}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.success("test_custom_browser_config completed successfully", tag="TEST")
+ return True
+ except Exception as e:
+ logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
+ return False
+
+async def test_multiple_pages():
+ """Test BrowserManager with multiple pages."""
+ logger.info("Starting test_multiple_pages", tag="TEST")
+
+ try:
+ # Create browser manager
+ manager = BrowserManager(logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create multiple pages
+ pages = []
+ urls = ["https://example.com", "https://example.org", "https://mozilla.org"]
+
+ for i, url in enumerate(urls):
+ crawler_config = CrawlerRunConfig(url=url)
+ page, context = await manager.get_page(crawler_config)
+ await page.goto(url)
+ pages.append((page, url))
+ logger.info(f"Created page {i+1} for {url}", tag="TEST")
+
+ # Verify all pages are loaded correctly
+ for i, (page, url) in enumerate(pages):
+ title = await page.title()
+ logger.info(f"Page {i+1} title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.success("test_multiple_pages completed successfully", tag="TEST")
+ return True
+ except Exception as e:
+ logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
+ return False
+
+async def test_session_management():
+ """Test session management in BrowserManager."""
+ logger.info("Starting test_session_management", tag="TEST")
+
+ try:
+ # Create browser manager
+ manager = BrowserManager(logger=logger)
+
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create a session
+ session_id = "test_session_1"
+ crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
+ page1, context1 = await manager.get_page(crawler_config)
+ await page1.goto("https://example.com")
+ logger.info(f"Created session with ID: {session_id}", tag="TEST")
+
+ # Get the same session again
+ page2, context2 = await manager.get_page(crawler_config)
+
+ # Verify it's the same page/context
+ is_same_page = page1 == page2
+ is_same_context = context1 == context2
+ logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")
+
+ # Kill the session
+ await manager.kill_session(session_id)
+ logger.info(f"Killed session with ID: {session_id}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.success("test_session_management completed successfully", tag="TEST")
+ return True
+ except Exception as e:
+ logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
+ return False
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ results.append(await test_basic_browser_manager())
+ results.append(await test_custom_browser_config())
+ results.append(await test_multiple_pages())
+ results.append(await test_session_management())
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
diff --git a/tests/browser/test_builtin_browser.py b/tests/browser/test_builtin_browser.py
new file mode 100644
index 00000000..4797648c
--- /dev/null
+++ b/tests/browser/test_builtin_browser.py
@@ -0,0 +1,809 @@
+"""
+Test script for builtin browser functionality in the browser module.
+
+This script tests:
+1. Creating a builtin browser
+2. Getting browser information
+3. Killing the browser
+4. Restarting the browser
+5. Testing operations with different browser strategies
+6. Testing edge cases
+"""
+
+import asyncio
+import os
+import sys
+import time
+from typing import List, Dict, Any
+from colorama import Fore, Style, init
+
+# Add the project root to the path for imports
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+from rich.text import Text
+from rich.box import Box, SIMPLE
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.browser.strategies import BuiltinBrowserStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Initialize colorama for cross-platform colored terminal output
+init()
+
+# Define colors for pretty output
+SUCCESS = Fore.GREEN
+WARNING = Fore.YELLOW
+ERROR = Fore.RED
+INFO = Fore.CYAN
+RESET = Fore.RESET
+
+# Create logger
+logger = AsyncLogger(verbose=True)
+
+
+async def test_builtin_browser_creation():
+ """Test creating a builtin browser using the BrowserManager with BuiltinBrowserStrategy"""
+ print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")
+
+ # Step 1: Create a BrowserManager with builtin mode
+ print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
+ browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Step 2: Check if we have a BuiltinBrowserStrategy
+ print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
+ if isinstance(manager.strategy, BuiltinBrowserStrategy):
+ print(
+ f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}"
+ )
+ else:
+ print(
+ f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}"
+ )
+ return None
+
+ # Step 3: Start the manager to launch or connect to builtin browser
+ print(f"\n{INFO}3. Starting the browser manager{RESET}")
+ try:
+ await manager.start()
+ print(f"{SUCCESS}Browser manager started successfully{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
+ return None
+
+ # Step 4: Get browser info from the strategy
+ print(f"\n{INFO}4. Getting browser information{RESET}")
+ browser_info = manager.strategy.get_browser_info()
+ if browser_info:
+ print(f"{SUCCESS}Browser info retrieved:{RESET}")
+ for key, value in browser_info.items():
+ if key != "config": # Skip the verbose config section
+ print(f" {key}: {value}")
+
+ cdp_url = browser_info.get("cdp_url")
+ print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
+ else:
+ print(f"{ERROR}Failed to get browser information{RESET}")
+ cdp_url = None
+
+ # Save manager for later tests
+ return manager, cdp_url
+
+
+async def test_page_operations(manager: BrowserManager):
+ """Test page operations with the builtin browser"""
+ print(
+ f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
+ )
+
+ # Step 1: Get a single page
+ print(f"\n{INFO}1. Getting a single page{RESET}")
+ try:
+ crawler_config = CrawlerRunConfig()
+ page, context = await manager.get_page(crawler_config)
+ print(f"{SUCCESS}Got page successfully{RESET}")
+
+ # Navigate to a test URL
+ await page.goto("https://example.com")
+ title = await page.title()
+ print(f"{SUCCESS}Page title: {title}{RESET}")
+
+ # Close the page
+ await page.close()
+ print(f"{SUCCESS}Page closed successfully{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
+ return False
+
+ # Step 2: Get multiple pages
+ print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
+ try:
+ # Request 3 pages
+ crawler_config = CrawlerRunConfig()
+ pages = await manager.get_pages(crawler_config, count=3)
+ print(f"{SUCCESS}Got {len(pages)} pages{RESET}")
+
+ # Test each page
+ for i, (page, context) in enumerate(pages):
+ await page.goto(f"https://example.com?test={i}")
+ title = await page.title()
+ print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
+ await page.close()
+
+ print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
+ return False
+
+ return True
+
+
+async def test_browser_status_management(manager: BrowserManager):
+ """Test browser status and management operations"""
+ print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")
+
+ # Step 1: Get browser status
+ print(f"\n{INFO}1. Getting browser status{RESET}")
+ try:
+ status = await manager.strategy.get_builtin_browser_status()
+ print(f"{SUCCESS}Browser status:{RESET}")
+ print(f" Running: {status['running']}")
+ print(f" CDP URL: {status['cdp_url']}")
+ except Exception as e:
+ print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
+ return False
+
+ # Step 2: Test killing the browser
+ print(f"\n{INFO}2. Testing killing the browser{RESET}")
+ try:
+ result = await manager.strategy.kill_builtin_browser()
+ if result:
+ print(f"{SUCCESS}Browser killed successfully{RESET}")
+ else:
+ print(f"{ERROR}Failed to kill browser{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
+ return False
+
+ # Step 3: Check status after kill
+ print(f"\n{INFO}3. Checking status after kill{RESET}")
+ try:
+ status = await manager.strategy.get_builtin_browser_status()
+ if not status["running"]:
+ print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
+ else:
+ print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
+ return False
+
+ # Step 4: Launch a new browser
+ print(f"\n{INFO}4. Launching a new browser{RESET}")
+ try:
+ cdp_url = await manager.strategy.launch_builtin_browser(
+ browser_type="chromium", headless=True
+ )
+ if cdp_url:
+ print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
+ else:
+ print(f"{ERROR}Failed to launch new browser{RESET}")
+ return False
+ except Exception as e:
+ print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
+ return False
+
+ return True
+
+
+async def test_multiple_managers():
+ """Test creating multiple BrowserManagers that use the same builtin browser"""
+ print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")
+
+ # Step 1: Create first manager
+ print(f"\n{INFO}1. Creating first browser manager{RESET}")
+ browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
+ manager1 = BrowserManager(browser_config=browser_config1, logger=logger)
+
+ # Step 2: Create second manager
+ print(f"\n{INFO}2. Creating second browser manager{RESET}")
+ browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
+ manager2 = BrowserManager(browser_config=browser_config2, logger=logger)
+
+ # Step 3: Start both managers (should connect to the same builtin browser)
+ print(f"\n{INFO}3. Starting both managers{RESET}")
+ try:
+ await manager1.start()
+ print(f"{SUCCESS}First manager started{RESET}")
+
+ await manager2.start()
+ print(f"{SUCCESS}Second manager started{RESET}")
+
+ # Check if they got the same CDP URL
+ cdp_url1 = manager1.strategy.config.cdp_url
+ cdp_url2 = manager2.strategy.config.cdp_url
+
+ if cdp_url1 == cdp_url2:
+ print(
+ f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
+ )
+ else:
+ print(
+ f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
+ )
+ except Exception as e:
+ print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
+ return False
+
+ # Step 4: Test using both managers
+ print(f"\n{INFO}4. Testing operations with both managers{RESET}")
+ try:
+ # First manager creates a page
+ page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
+ await page1.goto("https://example.com")
+ title1 = await page1.title()
+ print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")
+
+ # Second manager creates a page
+ page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
+ await page2.goto("https://example.org")
+ title2 = await page2.title()
+ print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")
+
+ # Clean up
+ await page1.close()
+ await page2.close()
+ except Exception as e:
+ print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
+ return False
+
+ # Step 5: Close both managers
+ print(f"\n{INFO}5. Closing both managers{RESET}")
+ try:
+ await manager1.close()
+ print(f"{SUCCESS}First manager closed{RESET}")
+
+ await manager2.close()
+ print(f"{SUCCESS}Second manager closed{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
+ return False
+
+ return True
+
+
+async def test_edge_cases():
+ """Test edge cases like multiple starts, killing browser during operations, etc."""
+ print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")
+
+ # Step 1: Test multiple starts with the same manager
+ print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
+ browser_config = BrowserConfig(browser_mode="builtin", headless=True)
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ print(f"{SUCCESS}First start successful{RESET}")
+
+ # Try to start again
+ await manager.start()
+ print(f"{SUCCESS}Second start completed without errors{RESET}")
+
+ # Test if it's still functional
+ page, context = await manager.get_page(CrawlerRunConfig())
+ await page.goto("https://example.com")
+ title = await page.title()
+ print(
+ f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
+ )
+ await page.close()
+ except Exception as e:
+ print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
+ return False
+ finally:
+ await manager.close()
+
+ # Step 2: Test killing the browser while manager is active
+ print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ print(f"{SUCCESS}Manager started{RESET}")
+
+ # Kill the browser directly
+ print(f"{INFO}Killing the browser...{RESET}")
+ await manager.strategy.kill_builtin_browser()
+ print(f"{SUCCESS}Browser killed{RESET}")
+
+ # Try to get a page (should fail or launch a new browser)
+ try:
+ page, context = await manager.get_page(CrawlerRunConfig())
+ print(
+ f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
+ )
+ title = await page.title()
+ print(f"{SUCCESS}Got page title: {title}{RESET}")
+ await page.close()
+ except Exception as e:
+ print(
+ f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
+ )
+ except Exception as e:
+ print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
+ return False
+ finally:
+ await manager.close()
+
+ return True
+
+
+async def cleanup_browsers():
+ """Clean up any remaining builtin browsers"""
+ print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")
+
+ browser_config = BrowserConfig(browser_mode="builtin", headless=True)
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ # No need to start, just access the strategy directly
+ strategy = manager.strategy
+ if isinstance(strategy, BuiltinBrowserStrategy):
+ result = await strategy.kill_builtin_browser()
+ if result:
+ print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
+ else:
+ print(f"{WARNING}No builtin browsers found to kill{RESET}")
+ else:
+ print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
+ except Exception as e:
+ print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
+ finally:
+ # Just to be safe
+ try:
+ await manager.close()
+ except:
+ pass
+
+
+async def test_performance_scaling():
+ """Test performance with multiple browsers and pages.
+
+ This test creates multiple browsers on different ports,
+ spawns multiple pages per browser, and measures performance metrics.
+ """
+ print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
+
+ # Configuration parameters
+ num_browsers = 10
+ pages_per_browser = 10
+ total_pages = num_browsers * pages_per_browser
+ base_port = 9222
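+ # Each browser instance gets its own CDP debugging port: base_port, base_port + 1, and so on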
+
+ # Set up a measuring mechanism for memory
+ import psutil
+ import gc
+
+ # Force garbage collection before starting
+ gc.collect()
+ process = psutil.Process()
+ initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
+ peak_memory = initial_memory
+
+ # Report initial configuration
+ print(
+ f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
+ )
+
+ # List to track managers
+ managers: List[BrowserManager] = []
+ all_pages = []
+
+ # Get crawl4ai home directory
+ crawl4ai_home = os.path.expanduser("~/.crawl4ai")
+ temp_dir = os.path.join(crawl4ai_home, "temp")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ # Create all managers but don't start them yet
+ manager_configs = []
+ for i in range(num_browsers):
+ port = base_port + i
+ browser_config = BrowserConfig(
+ browser_mode="builtin",
+ headless=True,
+ debugging_port=port,
+ user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
+ )
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
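+ # Flagging the strategy as shutting down here is presumably a test-only tweak to its lifecycle handling (e.g. skipping shared builtin-browser reuse so each manager keeps its own instance); assumption, not verified against the strategy implementation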
+ manager.strategy.shutting_down = True
+ manager_configs.append((manager, i, port))
+
+ # Define async function to start a single manager
+ async def start_manager(manager, index, port):
+ try:
+ await manager.start()
+ return manager
+ except Exception as e:
+ print(
+ f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
+ )
+ return None
+
+ # Start all managers in parallel
+ start_tasks = [
+ start_manager(manager, i, port) for manager, i, port in manager_configs
+ ]
+ started_managers = await asyncio.gather(*start_tasks)
+
+ # Filter out None values (failed starts) and add to managers list
+ managers = [m for m in started_managers if m is not None]
+
+ if len(managers) == 0:
+ print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
+ return False
+
+ if len(managers) < num_browsers:
+ print(
+ f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
+ )
+
+ # Create pages for each browser
+ for i, manager in enumerate(managers):
+ try:
+ pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
+ all_pages.extend(pages)
+ except Exception as e:
+ print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
+
+ # Check memory after page creation
+ gc.collect()
+ current_memory = process.memory_info().rss / 1024 / 1024
+ peak_memory = max(peak_memory, current_memory)
+
+ # Ask for confirmation before loading
+ confirmation = input(
+ f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
+ )
+ # Start the overall timer used for total_test_time (browsers and pages are already set up at this point)
+ start_time = time.time()
+
+ if confirmation.lower() == "y":
+ load_start_time = time.time()
+
+ # Function to load a single page
+ async def load_page(page_ctx, index):
+ page, _ = page_ctx
+ try:
+ await page.goto(f"https://example.com/page{index}", timeout=30000)
+ title = await page.title()
+ return title
+ except Exception as e:
+ return f"Error: {str(e)}"
+
+ # Load all pages concurrently
+ load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
+ load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
+
+ # Count successes and failures
+ successes = sum(
+ 1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
+ )
+ failures = len(load_results) - successes
+
+ load_time = time.time() - load_start_time
+ total_test_time = time.time() - start_time
+
+ # Check memory after loading (peak memory)
+ gc.collect()
+ current_memory = process.memory_info().rss / 1024 / 1024
+ peak_memory = max(peak_memory, current_memory)
+
+ # Calculate key metrics
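+ # Note: "memory per crawl" below is total peak RSS divided by successful crawls, not the per-crawl increment over the initial baseline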
+ memory_per_page = peak_memory / successes if successes > 0 else 0
+ time_per_crawl = total_test_time / successes if successes > 0 else 0
+ crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
+ crawls_per_minute = crawls_per_second * 60
+ crawls_per_hour = crawls_per_minute * 60
+
+ # Print simplified performance summary
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ # Create a simple summary table
+ table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
+
+ table.add_column("Metric", style="cyan")
+ table.add_column("Value", style="green")
+
+ table.add_row("Total Crawls Completed", f"{successes}")
+ table.add_row("Total Time", f"{total_test_time:.2f} seconds")
+ table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
+ table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
+ table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
+ table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
+ table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
+ table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
+
+ # Display the table
+ console.print(table)
+
+ # Ask confirmation before cleanup
+ confirmation = input(
+ f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
+ )
+ if confirmation.lower() != "y":
+ print(f"{WARNING}Cleanup aborted by user{RESET}")
+ return False
+
+ # Close all pages
+ for page, _ in all_pages:
+ try:
+ await page.close()
+ except:
+ pass
+
+ # Close all managers
+ for manager in managers:
+ try:
+ await manager.close()
+ except:
+ pass
+
+ # Remove the temp directory
+ import shutil
+
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+ return True
+
+
+async def test_performance_scaling_lab(num_browsers: int = 10, pages_per_browser: int = 10):
+ """Test performance with multiple browsers and pages.
+
+ This test creates multiple browsers on different ports,
+ spawns multiple pages per browser, and measures performance metrics.
+ """
+ print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")
+
+ # Configuration parameters
+ total_pages = num_browsers * pages_per_browser
+ base_port = 9222
+
+ # Set up a measuring mechanism for memory
+ import psutil
+ import gc
+
+ # Force garbage collection before starting
+ gc.collect()
+ process = psutil.Process()
+ initial_memory = process.memory_info().rss / 1024 / 1024 # in MB
+ peak_memory = initial_memory
+
+ # Report initial configuration
+ print(
+ f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
+ )
+
+ # List to track managers
+ managers: List[BrowserManager] = []
+ all_pages = []
+
+ # Get crawl4ai home directory
+ crawl4ai_home = os.path.expanduser("~/.crawl4ai")
+ temp_dir = os.path.join(crawl4ai_home, "temp")
+ os.makedirs(temp_dir, exist_ok=True)
+
+ # Create all managers but don't start them yet
+ manager_configs = []
+ for i in range(num_browsers):
+ port = base_port + i
+ browser_config = BrowserConfig(
+ browser_mode="builtin",
+ headless=True,
+ debugging_port=port,
+ user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
+ )
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+ manager.strategy.shutting_down = True
+ manager_configs.append((manager, i, port))
+
+ # Define async function to start a single manager
+ async def start_manager(manager, index, port):
+ try:
+ await manager.start()
+ return manager
+ except Exception as e:
+ print(
+ f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
+ )
+ return None
+
+ # Start all managers in parallel
+ start_tasks = [
+ start_manager(manager, i, port) for manager, i, port in manager_configs
+ ]
+ started_managers = await asyncio.gather(*start_tasks)
+
+ # Filter out None values (failed starts) and add to managers list
+ managers = [m for m in started_managers if m is not None]
+
+ if len(managers) == 0:
+ print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
+ return False
+
+ if len(managers) < num_browsers:
+ print(
+ f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
+ )
+
+ # Create pages for each browser
+ for i, manager in enumerate(managers):
+ try:
+ pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
+ all_pages.extend(pages)
+ except Exception as e:
+ print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")
+
+ # Check memory after page creation
+ gc.collect()
+ current_memory = process.memory_info().rss / 1024 / 1024
+ peak_memory = max(peak_memory, current_memory)
+
+ # Ask for confirmation before loading
+ confirmation = input(
+ f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
+ )
+ # Start the overall timer used for total_test_time (browsers and pages are already set up at this point)
+ start_time = time.time()
+
+ if confirmation.lower() == "y":
+ load_start_time = time.time()
+
+ # Function to load a single page
+ async def load_page(page_ctx, index):
+ page, _ = page_ctx
+ try:
+ await page.goto(f"https://example.com/page{index}", timeout=30000)
+ title = await page.title()
+ return title
+ except Exception as e:
+ return f"Error: {str(e)}"
+
+ # Load all pages concurrently
+ load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
+ load_results = await asyncio.gather(*load_tasks, return_exceptions=True)
+
+ # Count successes and failures
+ successes = sum(
+ 1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
+ )
+ failures = len(load_results) - successes
+
+ load_time = time.time() - load_start_time
+ total_test_time = time.time() - start_time
+
+ # Check memory after loading (peak memory)
+ gc.collect()
+ current_memory = process.memory_info().rss / 1024 / 1024
+ peak_memory = max(peak_memory, current_memory)
+
+ # Calculate key metrics
+ memory_per_page = peak_memory / successes if successes > 0 else 0
+ time_per_crawl = total_test_time / successes if successes > 0 else 0
+ crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
+ crawls_per_minute = crawls_per_second * 60
+ crawls_per_hour = crawls_per_minute * 60
+
+ # Print simplified performance summary
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+ # Create a simple summary table
+ table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")
+
+ table.add_column("Metric", style="cyan")
+ table.add_column("Value", style="green")
+
+ table.add_row("Total Crawls Completed", f"{successes}")
+ table.add_row("Total Time", f"{total_test_time:.2f} seconds")
+ table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
+ table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
+ table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
+ table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
+ table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
+ table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")
+
+ # Display the table
+ console.print(table)
+
+ # Ask confirmation before cleanup
+ confirmation = input(
+ f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
+ )
+ if confirmation.lower() != "y":
+ print(f"{WARNING}Cleanup aborted by user{RESET}")
+ return False
+
+ # Close all pages
+ for page, _ in all_pages:
+ try:
+ await page.close()
+ except:
+ pass
+
+ # Close all managers
+ for manager in managers:
+ try:
+ await manager.close()
+ except:
+ pass
+
+ # Remove the temp directory
+ import shutil
+
+ if os.path.exists(temp_dir):
+ shutil.rmtree(temp_dir)
+
+ return True
+
+
+async def main():
+ """Run all tests"""
+ try:
+ print(f"{INFO}Starting builtin browser tests with browser module{RESET}")
+
+ # # Run browser creation test
+ # manager, cdp_url = await test_builtin_browser_creation()
+ # if not manager:
+ # print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
+ # return
+
+ # # Run page operations test
+ # await test_page_operations(manager)
+
+ # # Run browser status and management test
+ # await test_browser_status_management(manager)
+
+ # # Close manager before multiple manager test
+ # await manager.close()
+
+ # Run multiple managers test
+ await test_multiple_managers()
+
+ # Run performance scaling test
+ await test_performance_scaling()
+
+ # Run cleanup test
+ await cleanup_browsers()
+
+ # Run edge cases test
+ await test_edge_cases()
+
+ print(f"\n{SUCCESS}All tests completed!{RESET}")
+
+ except Exception as e:
+ print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
+ import traceback
+
+ traceback.print_exc()
+ finally:
+ # Clean up: kill any remaining builtin browsers
+ await cleanup_browsers()
+ print(f"{SUCCESS}Test cleanup complete{RESET}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/browser/test_builtin_strategy.py b/tests/browser/test_builtin_strategy.py
new file mode 100644
index 00000000..7c435b3d
--- /dev/null
+++ b/tests/browser/test_builtin_strategy.py
@@ -0,0 +1,160 @@
+"""Test examples for BuiltinBrowserStrategy.
+
+These examples demonstrate the functionality of BuiltinBrowserStrategy
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import sys
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def test_builtin_browser():
+ """Test using a builtin browser that persists between sessions."""
+ logger.info("Testing builtin browser", tag="TEST")
+
+ browser_config = BrowserConfig(
+ browser_mode="builtin",
+ headless=True
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ # Start should connect to existing builtin browser or create one
+ await manager.start()
+ logger.info("Connected to builtin browser", tag="TEST")
+
+ # Test page creation
+ crawler_config = CrawlerRunConfig()
+ page, context = await manager.get_page(crawler_config)
+
+ # Test navigation
+ await page.goto("https://example.com")
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Close manager (should not close the builtin browser)
+ await manager.close()
+ logger.info("First session closed", tag="TEST")
+
+ # Create a second manager to verify browser persistence
+ logger.info("Creating second session to verify persistence", tag="TEST")
+ manager2 = BrowserManager(browser_config=browser_config, logger=logger)
+
+ await manager2.start()
+ logger.info("Connected to existing builtin browser", tag="TEST")
+
+ page2, context2 = await manager2.get_page(crawler_config)
+ await page2.goto("https://example.org")
+ title2 = await page2.title()
+ logger.info(f"Second session page title: {title2}", tag="TEST")
+
+ await manager2.close()
+ logger.info("Second session closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_builtin_browser_status():
+ """Test getting status of the builtin browser."""
+ logger.info("Testing builtin browser status", tag="TEST")
+
+ from crawl4ai.browser.strategies import BuiltinBrowserStrategy
+
+ browser_config = BrowserConfig(
+ browser_mode="builtin",
+ headless=True
+ )
+
+ # Create strategy directly to access its status methods
+ strategy = BuiltinBrowserStrategy(browser_config, logger)
+
+ try:
+ # Get status before starting (should be not running)
+ status_before = await strategy.get_builtin_browser_status()
+ logger.info(f"Initial status: {status_before}", tag="TEST")
+
+ # Start the browser
+ await strategy.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Get status after starting
+ status_after = await strategy.get_builtin_browser_status()
+ logger.info(f"Status after start: {status_after}", tag="TEST")
+
+ # Create a page to verify functionality
+ crawler_config = CrawlerRunConfig()
+ page, context = await strategy.get_page(crawler_config)
+ await page.goto("https://example.com")
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Close strategy (should not kill the builtin browser)
+ await strategy.close()
+ logger.info("Strategy closed successfully", tag="TEST")
+
+ # Create a new strategy object
+ strategy2 = BuiltinBrowserStrategy(browser_config, logger)
+
+ # Get status again (should still be running)
+ status_final = await strategy2.get_builtin_browser_status()
+ logger.info(f"Final status: {status_final}", tag="TEST")
+
+ # Verify that the status shows the browser is running
+ is_running = status_final.get('running', False)
+ logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")
+
+ # Kill the builtin browser to clean up
+ logger.info("Killing builtin browser", tag="TEST")
+ success = await strategy2.kill_builtin_browser()
+ logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")
+
+ return is_running and success
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await strategy.close()
+
+ # Try to kill the builtin browser to clean up
+ strategy2 = BuiltinBrowserStrategy(browser_config, logger)
+ await strategy2.kill_builtin_browser()
+ except:
+ pass
+ return False
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ results.append(await test_builtin_browser())
+ results.append(await test_builtin_browser_status())
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
diff --git a/tests/browser/test_cdp_strategy.py b/tests/browser/test_cdp_strategy.py
new file mode 100644
index 00000000..1df089a5
--- /dev/null
+++ b/tests/browser/test_cdp_strategy.py
@@ -0,0 +1,228 @@
+"""Test examples for CDPBrowserStrategy.
+
+These examples demonstrate the functionality of CDPBrowserStrategy
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import sys
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def test_cdp_launch_connect():
+ """Test launching a browser and connecting via CDP."""
+ logger.info("Testing launch and connect via CDP", tag="TEST")
+
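+ # Per the docstring, browser_mode="cdp" with use_managed_browser=True should have the manager launch a local browser and attach to it over CDP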
+ browser_config = BrowserConfig(
+ use_managed_browser=True,
+ browser_mode="cdp",
+ headless=True
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ logger.info("Browser launched and connected via CDP", tag="TEST")
+
+ # Test with multiple pages
+ pages = []
+ for i in range(3):
+ crawler_config = CrawlerRunConfig()
+ page, context = await manager.get_page(crawler_config)
+ await page.goto(f"https://example.com?test={i}")
+ pages.append(page)
+ logger.info(f"Created page {i+1}", tag="TEST")
+
+ # Verify all pages are working
+ for i, page in enumerate(pages):
+ title = await page.title()
+ logger.info(f"Page {i+1} title: {title}", tag="TEST")
+
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_cdp_with_user_data_dir():
+ """Test CDP browser with a user data directory."""
+ logger.info("Testing CDP browser with user data directory", tag="TEST")
+
+ # Create a temporary user data directory
+ import tempfile
+ user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
+ logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")
+
+ browser_config = BrowserConfig(
+ headless=True,
+ browser_mode="cdp",
+ user_data_dir=user_data_dir
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ logger.info("Browser launched with user data directory", tag="TEST")
+
+ # Navigate to a page and store some data
+ crawler_config = CrawlerRunConfig()
+ page, context = await manager.get_page(crawler_config)
+
+ # Set a cookie
+ await context.add_cookies([{
+ "name": "test_cookie",
+ "value": "test_value",
+ "url": "https://example.com"
+ }])
+
+ # Visit the site
+ await page.goto("https://example.com")
+
+ # Verify cookie was set
+ cookies = await context.cookies(["https://example.com"])
+ has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
+ logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")
+
+ # Close the browser
+ await manager.close()
+ logger.info("First browser session closed", tag="TEST")
+
+ # Start a new browser with the same user data directory
+ logger.info("Starting second browser session with same user data directory", tag="TEST")
+ manager2 = BrowserManager(browser_config=browser_config, logger=logger)
+ await manager2.start()
+
+ # Get a new page and check if the cookie persists
+ page2, context2 = await manager2.get_page(crawler_config)
+ await page2.goto("https://example.com")
+
+ # Verify cookie persisted
+ cookies2 = await context2.cookies(["https://example.com"])
+ has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
+ logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")
+
+ # Clean up
+ await manager2.close()
+
+ # Remove temporary directory
+ import shutil
+ shutil.rmtree(user_data_dir, ignore_errors=True)
+ logger.info(f"Removed temporary user data directory", tag="TEST")
+
+ return has_test_cookie and has_test_cookie2
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+
+ # Clean up temporary directory
+ try:
+ import shutil
+ shutil.rmtree(user_data_dir, ignore_errors=True)
+ except:
+ pass
+
+ return False
+
+async def test_cdp_session_management():
+ """Test session management with CDP browser."""
+ logger.info("Testing session management with CDP browser", tag="TEST")
+
+ browser_config = BrowserConfig(
+ use_managed_browser=True,
+ headless=True
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ logger.info("Browser launched successfully", tag="TEST")
+
+ # Create two sessions
+ session1_id = "test_session_1"
+ session2_id = "test_session_2"
+
+ # Set up first session
+ crawler_config1 = CrawlerRunConfig(session_id=session1_id)
+ page1, context1 = await manager.get_page(crawler_config1)
+ await page1.goto("https://example.com")
+ await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
+ logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
+
+ # Set up second session
+ crawler_config2 = CrawlerRunConfig(session_id=session2_id)
+ page2, context2 = await manager.get_page(crawler_config2)
+ await page2.goto("https://example.org")
+ await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
+ logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
+
+ # Get first session again
+ page1_again, _ = await manager.get_page(crawler_config1)
+
+ # Verify it's the same page and data persists
+ is_same_page = page1 == page1_again
+ data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
+ logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
+
+ # Kill first session
+ await manager.kill_session(session1_id)
+ logger.info(f"Killed session 1", tag="TEST")
+
+ # Verify second session still works
+ data2 = await page2.evaluate("localStorage.getItem('session2_data')")
+ logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return is_same_page and data1 == "test_value" and data2 == "test_value2"
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ # results.append(await test_cdp_launch_connect())
+ results.append(await test_cdp_with_user_data_dir())
+ results.append(await test_cdp_session_management())
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
diff --git a/tests/browser/test_combined.py b/tests/browser/test_combined.py
new file mode 100644
index 00000000..b5bce3cd
--- /dev/null
+++ b/tests/browser/test_combined.py
@@ -0,0 +1,77 @@
+"""Combined test runner for all browser module tests.
+
+This script runs all the browser module tests in sequence and
+provides a comprehensive summary.
+"""
+
+import asyncio
+import os
+import sys
+import time
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def run_test_module(module_name, header):
+ """Run all tests in a module and return results."""
+ logger.info(f"\n{'-'*30}", tag="TEST")
+ logger.info(f"RUNNING: {header}", tag="TEST")
+ logger.info(f"{'-'*30}", tag="TEST")
+
+ # Import the module dynamically
+ module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])
+
+ # Track time for performance measurement
+ start_time = time.time()
+
+ # Run the tests
+ await module.run_tests()
+
+ # Calculate time taken
+ time_taken = time.time() - start_time
+ logger.info(f"Time taken: {time_taken:.2f} seconds", tag="TIMING")
+
+ return time_taken
+
+async def main():
+ """Run all test modules."""
+ logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")
+
+ # List of test modules to run
+ test_modules = [
+ ("test_browser_manager", "Browser Manager Tests"),
+ ("test_playwright_strategy", "Playwright Strategy Tests"),
+ ("test_cdp_strategy", "CDP Strategy Tests"),
+ ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
+ ("test_profiles", "Profile Management Tests")
+ ]
+
+ # Run each test module
+ timings = {}
+ for module_name, header in test_modules:
+ try:
+ time_taken = await run_test_module(module_name, header)
+ timings[module_name] = time_taken
+ except Exception as e:
+ logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")
+
+ # Print summary
+ logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
+ logger.info(f"{'-'*50}", tag="SUMMARY")
+ for module_name, header in test_modules:
+ if module_name in timings:
+ logger.info(f"{header}: {timings[module_name]:.2f} seconds", tag="SUMMARY")
+ else:
+ logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
+ logger.info(f"{'-'*50}", tag="SUMMARY")
+ total_time = sum(timings.values())
+ logger.info(f"Total time: {total_time:.2f} seconds", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/browser/test_launch_standalone.py b/tests/browser/test_launch_standalone.py
new file mode 100644
index 00000000..d60b12f3
--- /dev/null
+++ b/tests/browser/test_launch_standalone.py
@@ -0,0 +1,17 @@
+from crawl4ai.browser_profiler import BrowserProfiler
+import asyncio
+
+
+if __name__ == "__main__":
+ # Test launching a standalone browser
+ async def test_standalone_browser():
+ profiler = BrowserProfiler()
+ cdp_url = await profiler.launch_standalone_browser(
+ browser_type="chromium",
+ user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
+ debugging_port=9222,
+ headless=False
+ )
+ print(f"CDP URL: {cdp_url}")
+
+ asyncio.run(test_standalone_browser())
\ No newline at end of file
diff --git a/tests/browser/test_parallel_crawling.py b/tests/browser/test_parallel_crawling.py
new file mode 100644
index 00000000..9e72f06e
--- /dev/null
+++ b/tests/browser/test_parallel_crawling.py
@@ -0,0 +1,902 @@
+"""
+Test examples for parallel crawling with the browser module.
+
+These examples demonstrate the functionality of parallel page creation
+and serve as functional tests for multi-page crawling performance.
+"""
+
+import asyncio
+import os
+import sys
+import time
+from typing import List
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def test_get_pages_basic():
+ """Test basic functionality of get_pages method."""
+ logger.info("Testing basic get_pages functionality", tag="TEST")
+
+ browser_config = BrowserConfig(headless=True)
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+
+ # Request 3 pages
+ crawler_config = CrawlerRunConfig()
+ pages = await manager.get_pages(crawler_config, count=3)
+
+ # Verify we got the correct number of pages
+ assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"
+
+ # Verify each page is valid
+ for i, (page, context) in enumerate(pages):
+ await page.goto("https://example.com")
+ title = await page.title()
+ logger.info(f"Page {i+1} title: {title}", tag="TEST")
+ assert title, f"Page {i+1} has no title"
+
+ await manager.close()
+ logger.success("Basic get_pages test completed successfully", tag="TEST")
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_parallel_approaches_comparison():
+ """Compare two parallel crawling approaches:
+ 1. Create a page for each URL on-demand (get_page + gather)
+ 2. Get all pages upfront with get_pages, then use them (get_pages + gather)
+ """
+ logger.info("Comparing different parallel crawling approaches", tag="TEST")
+
+ urls = [
+ "https://example.com/page1",
+ "https://crawl4ai.com",
+ "https://kidocode.com",
+ "https://bbc.com",
+ # "https://example.com/page1",
+ # "https://example.com/page2",
+ # "https://example.com/page3",
+ # "https://example.com/page4",
+ ]
+
+ browser_config = BrowserConfig(headless=False)
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+
+ # Approach 1: Create a page for each URL on-demand and run in parallel
+ logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
+ start_time = time.time()
+
+ async def fetch_title_approach1(url):
+ """Create a new page for each URL, go to the URL, and get title"""
+ crawler_config = CrawlerRunConfig(url=url)
+ page, context = await manager.get_page(crawler_config)
+ try:
+ await page.goto(url)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ # Run fetch_title_approach1 for each URL in parallel
+ tasks = [fetch_title_approach1(url) for url in urls]
+ approach1_results = await asyncio.gather(*tasks)
+
+ approach1_time = time.time() - start_time
+ logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
+
+ # Approach 2: Get all pages upfront with get_pages, then use them in parallel
+ logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
+ start_time = time.time()
+
+ # Get all pages upfront
+ crawler_config = CrawlerRunConfig()
+ pages = await manager.get_pages(crawler_config, count=len(urls))
+
+ async def fetch_title_approach2(page_ctx, url):
+ """Use a pre-created page to go to URL and get title"""
+ page, _ = page_ctx
+ try:
+ await page.goto(url)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ # Use the pre-created pages to fetch titles in parallel
+ tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
+ approach2_results = await asyncio.gather(*tasks)
+
+ approach2_time = time.time() - start_time
+ logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
+
+ # Compare results and performance
+ speedup = approach1_time / approach2_time if approach2_time > 0 else 0
+ if speedup > 1:
+ logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
+ else:
+ logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
+
+ # Verify same content was retrieved in both approaches
+ assert len(approach1_results) == len(approach2_results), "Result count mismatch"
+
+ # Sort results for comparison since parallel execution might complete in different order
+ assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"
+
+ await manager.close()
+ return True
+
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
+ """Test performance with multiple browsers and pages per browser.
+ Compares two approaches:
+ 1. On-demand page creation (get_page + gather)
+ 2. Pre-created pages (get_pages + gather)
+ """
+ logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")
+
+ # Generate test URLs
+ total_pages = num_browsers * pages_per_browser
+ urls = [f"https://example.com/page_{i}" for i in range(total_pages)]
+
+ # Create browser managers
+ managers = []
+ base_port = 9222
+
+ try:
+ # Start all browsers in parallel
+ start_tasks = []
+ for i in range(num_browsers):
+ browser_config = BrowserConfig(
+ headless=True # Using default browser mode like in test_parallel_approaches_comparison
+ )
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+ start_tasks.append(manager.start())
+ managers.append(manager)
+
+ await asyncio.gather(*start_tasks)
+
+ # Distribute URLs among managers
+ urls_per_manager = {}
+ for i, manager in enumerate(managers):
+ start_idx = i * pages_per_browser
+ end_idx = min(start_idx + pages_per_browser, len(urls))
+ urls_per_manager[manager] = urls[start_idx:end_idx]
+
+ # Approach 1: Create a page for each URL on-demand and run in parallel
+ logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
+ start_time = time.time()
+
+ async def fetch_title_approach1(manager, url):
+ """Create a new page for the URL, go to the URL, and get title"""
+ crawler_config = CrawlerRunConfig(url=url)
+ page, context = await manager.get_page(crawler_config)
+ try:
+ await page.goto(url)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ # Run fetch_title_approach1 for each URL in parallel
+ tasks = []
+ for manager, manager_urls in urls_per_manager.items():
+ for url in manager_urls:
+ tasks.append(fetch_title_approach1(manager, url))
+
+ approach1_results = await asyncio.gather(*tasks)
+
+ approach1_time = time.time() - start_time
+ logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")
+
+ # Approach 2: Get all pages upfront with get_pages, then use them in parallel
+ logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
+ start_time = time.time()
+
+ # Get all pages upfront for each manager
+ all_pages = []
+ for manager, manager_urls in urls_per_manager.items():
+ crawler_config = CrawlerRunConfig()
+ pages = await manager.get_pages(crawler_config, count=len(manager_urls))
+ all_pages.extend(zip(pages, manager_urls))
+
+ async def fetch_title_approach2(page_ctx, url):
+ """Use a pre-created page to go to URL and get title"""
+ page, _ = page_ctx
+ try:
+ await page.goto(url)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ # Use the pre-created pages to fetch titles in parallel
+ tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
+ approach2_results = await asyncio.gather(*tasks)
+
+ approach2_time = time.time() - start_time
+ logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")
+
+ # Compare results and performance
+ speedup = approach1_time / approach2_time if approach2_time > 0 else 0
+ pages_per_second = total_pages / approach2_time
+
+ # Show a simple summary
+ logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
+ logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
+ logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")
+
+ if speedup > 1:
+ logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
+ else:
+ logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
+
+ # Close all managers
+ for manager in managers:
+ await manager.close()
+
+ return True
+
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Clean up
+ for manager in managers:
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def grid_search_optimal_configuration(total_urls=50):
+ """Perform a grid search to find the optimal balance between number of browsers and pages per browser.
+
+ This function tests different combinations of browser count and pages per browser,
+ while keeping the total number of URLs constant. It measures performance metrics
+ for each configuration to find the "sweet spot" that provides the best speed
+ with reasonable memory usage.
+
+ Args:
+ total_urls: Total number of URLs to crawl (default: 50)
+ """
+ logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
+
+ # Generate test URLs once
+ urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
+
+ # Define grid search configurations
+    # We'll use a more flexible approach: test all browser counts from 1 to min(20, total_urls)
+ # and distribute pages evenly (some browsers may have 1 more page than others)
+ configurations = []
+
+ # Maximum number of browsers to test
+ max_browsers_to_test = min(20, total_urls)
+
+ # Try configurations with 1 to max_browsers_to_test browsers
+ for num_browsers in range(1, max_browsers_to_test + 1):
+ base_pages_per_browser = total_urls // num_browsers
+ remainder = total_urls % num_browsers
+
+ # Generate exact page distribution array
+ if remainder > 0:
+ # First 'remainder' browsers get one more page
+ page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
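+            # e.g. 50 URLs over 4 browsers: base 12, remainder 2 → (13, 13, 12, 12)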
+ pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
+ else:
+ # All browsers get the same number of pages
+ page_distribution = [base_pages_per_browser] * num_browsers
+ pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
+
+ # Format the distribution as a tuple string like (4, 4, 3, 3)
+ distribution_str = str(tuple(page_distribution))
+
+ configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
+
+ # Track results
+ results = []
+
+ # Test each configuration
+ for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
+ logger.info("-" * 80, tag="TEST")
+ logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
+ logger.info(f"Details: {pages_distribution}", tag="TEST")
+        # Brief pause between configurations
+ await asyncio.sleep(0.5)
+
+ try:
+ # Import psutil for memory tracking
+ try:
+ import psutil
+ process = psutil.Process()
+ initial_memory = process.memory_info().rss / (1024 * 1024) # MB
+ except ImportError:
+ logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
+ initial_memory = 0
+
+ # Create and start browser managers
+ managers = []
+ start_time = time.time()
+
+ # Start all browsers in parallel
+ start_tasks = []
+ for i in range(num_browsers):
+ browser_config = BrowserConfig(
+ headless=True
+ )
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+ start_tasks.append(manager.start())
+ managers.append(manager)
+
+ await asyncio.gather(*start_tasks)
+ browser_startup_time = time.time() - start_time
+
+ # Measure memory after browser startup
+ if initial_memory > 0:
+ browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
+ else:
+ browser_memory = 0
+
+ # Distribute URLs among managers using the exact page distribution
+ urls_per_manager = {}
+ total_assigned = 0
+
+ for i, manager in enumerate(managers):
+ if i < len(page_distribution):
+ # Get the exact number of pages for this browser from our distribution
+ manager_pages = page_distribution[i]
+
+ # Get the URL slice for this manager
+ start_idx = total_assigned
+ end_idx = start_idx + manager_pages
+ urls_per_manager[manager] = urls[start_idx:end_idx]
+ total_assigned += manager_pages
+ else:
+ # If we have more managers than our distribution (should never happen)
+ urls_per_manager[manager] = []
+
+ # Use the more efficient approach (pre-created pages)
+ logger.info("Running page crawling test...", tag="TEST")
+ crawl_start_time = time.time()
+
+ # Get all pages upfront for each manager
+ all_pages = []
+ for manager, manager_urls in urls_per_manager.items():
+ if not manager_urls: # Skip managers with no URLs
+ continue
+ crawler_config = CrawlerRunConfig()
+ pages = await manager.get_pages(crawler_config, count=len(manager_urls))
+ all_pages.extend(zip(pages, manager_urls))
+
+ # Measure memory after page creation
+ if initial_memory > 0:
+ pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
+ else:
+ pages_memory = 0
+
+ # Function to crawl a URL with a pre-created page
+ async def fetch_title(page_ctx, url):
+ page, _ = page_ctx
+ try:
+ await page.goto(url)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ # Use the pre-created pages to fetch titles in parallel
+ tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
+ crawl_results = await asyncio.gather(*tasks)
+
+ crawl_time = time.time() - crawl_start_time
+ total_time = time.time() - start_time
+
+ # Final memory measurement
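+            # Peak is approximated as the larger of the summed startup/page deltas and the current RSS delta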
+ if initial_memory > 0:
+ peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
+ else:
+ peak_memory = 0
+
+ # Close all managers
+ for manager in managers:
+ await manager.close()
+
+ # Calculate metrics
+ pages_per_second = total_urls / crawl_time
+
+ # Store result metrics
+ result = {
+ "num_browsers": num_browsers,
+ "pages_per_browser": pages_per_browser,
+ "page_distribution": page_distribution,
+ "distribution_str": distribution_str,
+ "total_urls": total_urls,
+ "browser_startup_time": browser_startup_time,
+ "crawl_time": crawl_time,
+ "total_time": total_time,
+ "browser_memory": browser_memory,
+ "pages_memory": pages_memory,
+ "peak_memory": peak_memory,
+ "pages_per_second": pages_per_second,
+ # Calculate efficiency score (higher is better)
+ # This balances speed vs memory usage
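+                # e.g. 10 pages/s at a 500 MB peak → 10 / 501 ≈ 0.02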
+ "efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
+ }
+
+ results.append(result)
+
+ # Log the results
+ logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
+ logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
+ logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
+ logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
+
+ if peak_memory > 0:
+ logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
+ logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
+ logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
+ logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
+
+ except Exception as e:
+ logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
+ import traceback
+ traceback.print_exc()
+
+ # Clean up
+ for manager in managers:
+ try:
+ await manager.close()
+ except:
+ pass
+
+ # Print summary of all configurations
+ logger.info("=" * 100, tag="TEST")
+ logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
+ logger.info("=" * 100, tag="TEST")
+
+ # Rank configurations by efficiency score
+ ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
+
+ # Also determine rankings by different metrics
+ fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
+ lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
+ most_efficient = ranked_results[0]
+
+ # Print top performers by category
+ logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
+ logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
+ f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
+
+ if lowest_memory["peak_memory"] > 0:
+ logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
+ f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
+
+ logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
+ f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
+
+ # Print result table header
+ logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
+ logger.info("-" * 120, tag="TEST")
+
+ # Define table header
+ header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
+ logger.info(header, tag="TEST")
+ logger.info("-" * 120, tag="TEST")
+
+ # Print each configuration in ranked order
+ for rank, result in enumerate(ranked_results, 1):
+ # Add special notes for top performers
+ notes = []
+ if result == fastest:
+ notes.append("⚡ Fastest")
+ if result == lowest_memory:
+ notes.append("💾 Lowest Memory")
+ if result == most_efficient:
+ notes.append("🌟 Most Efficient")
+
+ notes_str = " | ".join(notes) if notes else ""
+
+ # Format memory if available
+ memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
+
+ # Get the distribution string
+ dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
+
+ # Build the row
+ row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
+ row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
+
+ logger.info(row, tag="TEST")
+
+ logger.info("-" * 120, tag="TEST")
+
+ # Generate visualization if matplotlib is available
+ try:
+ import matplotlib.pyplot as plt
+ import numpy as np
+
+ # Extract data for plotting from ranked results
+ browser_counts = [r["num_browsers"] for r in ranked_results]
+ efficiency_scores = [r["efficiency_score"] for r in ranked_results]
+ crawl_times = [r["crawl_time"] for r in ranked_results]
+ total_times = [r["total_time"] for r in ranked_results]
+
+ # Filter results with memory data
+ memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
+ memory_browser_counts = [r["num_browsers"] for r in memory_results]
+ peak_memories = [r["peak_memory"] for r in memory_results]
+
+ # Create figure with clean design
+ plt.figure(figsize=(14, 12), facecolor='white')
+ plt.style.use('ggplot')
+
+ # Create grid for subplots
+ gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
+
+ # Plot 1: Efficiency Score (higher is better)
+ ax1 = plt.subplot(gs[0])
+ bar_colors = ['#3498db'] * len(browser_counts)
+
+ # Highlight the most efficient
+ most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
+ bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient
+
+ bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
+ ax1.set_xticks(range(len(browser_counts)))
+ ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
+ ax1.set_xlabel('Number of Browsers')
+ ax1.set_ylabel('Efficiency Score (higher is better)')
+ ax1.set_title('Browser Configuration Efficiency (higher is better)')
+
+ # Add value labels on top of bars
+ for bar, score in zip(bars, efficiency_scores):
+ height = bar.get_height()
+ ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
+ f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
+
+ # Highlight best configuration
+ ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
+ transform=ax1.transAxes, fontsize=12, verticalalignment='top',
+ bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
+
+ # Plot 2: Time Performance
+ ax2 = plt.subplot(gs[1])
+
+ # Plot both total time and crawl time
+ ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
+ ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
+
+ # Mark the fastest configuration
+ fastest_idx = browser_counts.index(fastest["num_browsers"])
+ ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
+ label=f'Fastest: {fastest["num_browsers"]} browsers')
+
+ ax2.set_xlabel('Number of Browsers')
+ ax2.set_ylabel('Time (seconds)')
+ ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
+ ax2.grid(True, linestyle='--', alpha=0.7)
+ ax2.legend(loc='upper right')
+
+ # Plot pages per second on second y-axis
+ pages_per_second = [total_urls/t for t in crawl_times]
+ ax2_twin = ax2.twinx()
+ ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
+ ax2_twin.set_ylabel('Pages per second')
+
+ # Add note about the fastest configuration
+ ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
+ f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
+ transform=ax2.transAxes, fontsize=12, verticalalignment='top',
+ bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
+
+ # Plot 3: Memory Usage (if available)
+ if memory_results:
+ ax3 = plt.subplot(gs[2])
+
+ # Prepare data for grouped bar chart
+ memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
+ memory_per_page = [m/(n*p) for m, n, p in zip(
+ [r["peak_memory"] for r in memory_results],
+ [r["num_browsers"] for r in memory_results],
+ [r["pages_per_browser"] for r in memory_results])]
+
+ x = np.arange(len(memory_browser_counts))
+ width = 0.35
+
+ # Create grouped bars
+ ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
+ ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
+
+ # Configure axis
+ ax3.set_xticks(x)
+ ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
+ ax3.set_xlabel('Number of Browsers')
+ ax3.set_ylabel('Memory (MB)')
+ ax3.set_title('Memory Usage by Browser Configuration')
+ ax3.legend(loc='upper left')
+ ax3.grid(True, linestyle='--', alpha=0.7)
+
+ # Add second y-axis for memory per page
+ ax3_twin = ax3.twinx()
+ ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
+ ax3_twin.set_ylabel('Memory per Page (MB)')
+
+ # Get lowest memory configuration
+ lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
+
+ # Add note about lowest memory configuration
+ ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
+ f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
+ transform=ax3.transAxes, fontsize=12, verticalalignment='top',
+ bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
+
+ # Add overall title
+ plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
+
+ # Add timestamp and info at the bottom
+ plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
+ ha="center", fontsize=10, style='italic')
+
+ # Get current directory and save the figure there
+ import os
+ __current_file = os.path.abspath(__file__)
+ current_dir = os.path.dirname(__current_file)
+ output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
+
+ # Adjust layout and save figure with high DPI
+ plt.tight_layout(rect=[0, 0.03, 1, 0.97])
+ plt.savefig(output_file, dpi=200, bbox_inches='tight')
+ logger.success(f"Visualization saved to {output_file}", tag="TEST")
+
+ except ImportError:
+ logger.warning("matplotlib not available, skipping visualization", tag="TEST")
+
+ return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
+
+async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
+ """Find optimal browser configuration for crawling a specific number of URLs.
+
+ Args:
+ total_urls: Number of URLs to crawl
+ verbose: Whether to print progress
+ rate_limit_delay: Delay between page loads to avoid rate limiting
+
+ Returns:
+ dict: Contains fastest, lowest_memory, and optimal configurations
+ """
+ if verbose:
+ print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
+
+ # Generate test URLs with timestamp to avoid caching
+ timestamp = int(time.time())
+ urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
+
+ # Limit browser configurations to test (1 browser to max 10)
+ max_browsers = min(10, total_urls)
+ configs_to_test = []
+
+ # Generate configurations (browser count, pages distribution)
+ for num_browsers in range(1, max_browsers + 1):
+ base_pages = total_urls // num_browsers
+ remainder = total_urls % num_browsers
+
+ # Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
+ if remainder > 0:
+ distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
+ else:
+ distribution = [base_pages] * num_browsers
+
+ configs_to_test.append((num_browsers, distribution))
+
+ results = []
+
+ # Test each configuration
+ for browser_count, page_distribution in configs_to_test:
+ if verbose:
+ print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
+
+ try:
+ # Track memory if possible
+ try:
+ import psutil
+ process = psutil.Process()
+ start_memory = process.memory_info().rss / (1024 * 1024) # MB
+ except ImportError:
+ if verbose:
+ print("Memory tracking not available (psutil not installed)")
+ start_memory = 0
+
+ # Start browsers in parallel
+ managers = []
+ start_tasks = []
+ start_time = time.time()
+
+ for i in range(browser_count):
+ config = BrowserConfig(headless=True)
+ manager = BrowserManager(browser_config=config, logger=logger)
+ start_tasks.append(manager.start())
+ managers.append(manager)
+
+ await asyncio.gather(*start_tasks)
+
+ # Distribute URLs among browsers
+ urls_per_manager = {}
+ url_index = 0
+
+ for i, manager in enumerate(managers):
+ pages_for_this_browser = page_distribution[i]
+ end_index = url_index + pages_for_this_browser
+ urls_per_manager[manager] = urls[url_index:end_index]
+ url_index = end_index
+
+ # Create pages for each browser
+ all_pages = []
+ for manager, manager_urls in urls_per_manager.items():
+ if not manager_urls:
+ continue
+ pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
+ all_pages.extend(zip(pages, manager_urls))
+
+ # Crawl pages with delay to avoid rate limiting
+ async def crawl_page(page_ctx, url):
+ page, _ = page_ctx
+ try:
+ await page.goto(url)
+ if rate_limit_delay > 0:
+ await asyncio.sleep(rate_limit_delay)
+ title = await page.title()
+ return title
+ finally:
+ await page.close()
+
+ crawl_start = time.time()
+ crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
+ await asyncio.gather(*crawl_tasks)
+ crawl_time = time.time() - crawl_start
+ total_time = time.time() - start_time
+
+ # Measure final memory usage
+ if start_memory > 0:
+ end_memory = process.memory_info().rss / (1024 * 1024)
+ memory_used = end_memory - start_memory
+ else:
+ memory_used = 0
+
+ # Close all browsers
+ for manager in managers:
+ await manager.close()
+
+ # Calculate metrics
+ pages_per_second = total_urls / crawl_time
+
+ # Calculate efficiency score (higher is better)
+ # This balances speed vs memory
+ if memory_used > 0:
+ efficiency = pages_per_second / (memory_used + 1)
+ else:
+ efficiency = pages_per_second
+
+ # Store result
+ result = {
+ "browser_count": browser_count,
+ "distribution": tuple(page_distribution),
+ "crawl_time": crawl_time,
+ "total_time": total_time,
+ "memory_used": memory_used,
+ "pages_per_second": pages_per_second,
+ "efficiency": efficiency
+ }
+
+ results.append(result)
+
+ if verbose:
+ print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
+ if memory_used > 0:
+ print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
+ print(f" ✓ Efficiency score: {efficiency:.4f}")
+
+ except Exception as e:
+ if verbose:
+ print(f" ✗ Error: {str(e)}")
+
+ # Clean up
+ for manager in managers:
+ try:
+ await manager.close()
+ except:
+ pass
+
+ # If no successful results, return None
+ if not results:
+ return None
+
+ # Find best configurations
+ fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
+
+ # Only consider memory if available
+ memory_results = [r for r in results if r["memory_used"] > 0]
+ if memory_results:
+ lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
+ else:
+ lowest_memory = fastest
+
+ # Find most efficient (balanced speed vs memory)
+ optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
+
+ # Print summary
+ if verbose:
+ print("\n=== OPTIMAL CONFIGURATIONS ===")
+ print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
+ print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
+
+ print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
+ if lowest_memory["memory_used"] > 0:
+ print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
+
+ print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
+ print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
+
+ return {
+ "fastest": fastest,
+ "lowest_memory": lowest_memory,
+ "optimal": optimal,
+ "all_configs": results
+ }
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ # Find optimal configuration using our utility function
+ configs = await find_optimal_browser_config(
+ total_urls=20, # Use a small number for faster testing
+ verbose=True,
+ rate_limit_delay=0.2 # 200ms delay between page loads to avoid rate limiting
+ )
+
+ if configs:
+ # Show the optimal configuration
+ optimal = configs["optimal"]
+ print(f"\n🎯 Recommended configuration for production use:")
+ print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
+ print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
+ results.append(True)
+ else:
+ print("\n❌ Failed to find optimal configuration")
+ results.append(False)
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ print(f"\nTests complete: {passed}/{total} passed")
+
+ if passed == total:
+ print("All tests passed!")
+ else:
+ print(f"{total - passed} tests failed")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/tests/browser/test_playwright_strategy.py b/tests/browser/test_playwright_strategy.py
new file mode 100644
index 00000000..94003b53
--- /dev/null
+++ b/tests/browser/test_playwright_strategy.py
@@ -0,0 +1,316 @@
+"""Test examples for PlaywrightBrowserStrategy.
+
+These examples demonstrate the functionality of PlaywrightBrowserStrategy
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import re
+import sys
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+
+
+async def test_start_close():
+ # Create browser config for standard Playwright
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1280,
+ viewport_height=800
+ )
+
+ # Create browser manager with the config
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
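+        # Start and close the browser several times to confirm the manager restarts cleanly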
+ for _ in range(4):
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Get a page
+ page, context = await manager.get_page(CrawlerRunConfig())
+ logger.info("Got page successfully", tag="TEST")
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ logger.info("Navigated to example.com", tag="TEST")
+
+ # Get page title
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ await asyncio.sleep(1) # Wait for a moment before restarting
+
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+ return True
+
+async def test_playwright_basic():
+ """Test basic Playwright browser functionality."""
+ logger.info("Testing standard Playwright browser", tag="TEST")
+
+ # Create browser config for standard Playwright
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1280,
+ viewport_height=800
+ )
+
+ # Create browser manager with the config
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create crawler config
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+
+ # Get a page
+ page, context = await manager.get_page(crawler_config)
+ logger.info("Got page successfully", tag="TEST")
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ logger.info("Navigated to example.com", tag="TEST")
+
+ # Get page title
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_playwright_text_mode():
+ """Test Playwright browser in text-only mode."""
+ logger.info("Testing Playwright text mode", tag="TEST")
+
+ # Create browser config with text mode enabled
+ browser_config = BrowserConfig(
+ headless=True,
+ text_mode=True # Enable text-only mode
+ )
+
+ # Create browser manager with the config
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully in text mode", tag="TEST")
+
+ # Get a page
+ crawler_config = CrawlerRunConfig(url="https://example.com")
+ page, context = await manager.get_page(crawler_config)
+
+ # Navigate to a website
+ await page.goto("https://example.com")
+ logger.info("Navigated to example.com", tag="TEST")
+
+ # Get page title
+ title = await page.title()
+ logger.info(f"Page title: {title}", tag="TEST")
+
+ # Check if images are blocked in text mode
+ # We'll check if any image requests were made
+        has_images = False
+        try:
+            # Wait briefly for any image request while loading an image-heavy page
+            async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
+                await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
+            await request_info.value
+            has_images = True
+        except Exception:
+            # Timeout without an image request means text mode is blocking images
+            has_images = False
+
+ logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return True
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_playwright_context_reuse():
+ """Test context caching and reuse with identical configurations."""
+ logger.info("Testing context reuse with identical configurations", tag="TEST")
+
+ # Create browser config
+ browser_config = BrowserConfig(headless=True)
+
+ # Create browser manager
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ # Start the browser
+ await manager.start()
+ logger.info("Browser started successfully", tag="TEST")
+
+ # Create identical crawler configs
+ crawler_config1 = CrawlerRunConfig(
+ css_selector="body",
+ )
+
+ crawler_config2 = CrawlerRunConfig(
+ css_selector="body",
+ )
+
+ # Get pages with these configs
+ page1, context1 = await manager.get_page(crawler_config1)
+ page2, context2 = await manager.get_page(crawler_config2)
+
+ # Check if contexts are reused
+ is_same_context = context1 == context2
+ logger.info(f"Contexts reused: {is_same_context}", tag="TEST")
+
+ # Now try with a different config
+ crawler_config3 = CrawlerRunConfig()
+
+ page3, context3 = await manager.get_page(crawler_config3)
+
+ # This should be a different context
+ is_different_context = context1 != context3
+ logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ # Both tests should pass for success
+ return is_same_context and is_different_context
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Ensure cleanup
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def test_playwright_session_management():
+ """Test session management with Playwright browser."""
+ logger.info("Testing session management with Playwright browser", tag="TEST")
+
+ browser_config = BrowserConfig(
+ headless=True
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ try:
+ await manager.start()
+ logger.info("Browser launched successfully", tag="TEST")
+
+ # Create two sessions
+ session1_id = "playwright_session_1"
+ session2_id = "playwright_session_2"
+
+ # Set up first session
+ crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
+ page1, context1 = await manager.get_page(crawler_config1)
+ await page1.goto("https://example.com")
+ await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
+ logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")
+
+ # Set up second session
+ crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
+ page2, context2 = await manager.get_page(crawler_config2)
+ await page2.goto("https://example.org")
+ await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
+ logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")
+
+ # Get first session again
+ page1_again, context1_again = await manager.get_page(crawler_config1)
+
+ # Verify it's the same page and data persists
+ is_same_page = page1 == page1_again
+ is_same_context = context1 == context1_again
+ data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
+ logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")
+
+ # Kill first session
+ await manager.kill_session(session1_id)
+ logger.info(f"Killed session 1", tag="TEST")
+
+ # Verify second session still works
+ data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
+ logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")
+
+ # Clean up
+ await manager.close()
+ logger.info("Browser closed successfully", tag="TEST")
+
+ return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ try:
+ await manager.close()
+ except:
+ pass
+ return False
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+    results.append(await test_start_close())
+    results.append(await test_playwright_basic())
+    results.append(await test_playwright_text_mode())
+    results.append(await test_playwright_context_reuse())
+    results.append(await test_playwright_session_management())
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
diff --git a/tests/browser/test_profiles.py b/tests/browser/test_profiles.py
new file mode 100644
index 00000000..8325b561
--- /dev/null
+++ b/tests/browser/test_profiles.py
@@ -0,0 +1,176 @@
+"""Test examples for BrowserProfileManager.
+
+These examples demonstrate the functionality of BrowserProfileManager
+and serve as functional tests.
+"""
+
+import asyncio
+import os
+import sys
+import uuid
+import shutil
+
+# Add the project root to Python path if running directly
+if __name__ == "__main__":
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+
+from crawl4ai.browser import BrowserManager, BrowserProfileManager
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_logger import AsyncLogger
+
+# Create a logger for clear terminal output
+logger = AsyncLogger(verbose=True, log_file=None)
+
+async def test_profile_creation():
+ """Test creating and managing browser profiles."""
+ logger.info("Testing profile creation and management", tag="TEST")
+
+    profile_manager = BrowserProfileManager(logger=logger)
+    profile_path = None
+
+ try:
+ # List existing profiles
+ profiles = profile_manager.list_profiles()
+ logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")
+
+ # Generate a unique profile name for testing
+ test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"
+
+ # Create a test profile directory
+ profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
+ os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
+
+ # Create a dummy Preferences file to simulate a Chrome profile
+ with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
+ f.write("{\"test\": true}")
+
+ logger.info(f"Created test profile at: {profile_path}", tag="TEST")
+
+ # Verify the profile is now in the list
+ profiles = profile_manager.list_profiles()
+ profile_found = any(p["name"] == test_profile_name for p in profiles)
+ logger.info(f"Profile found in list: {profile_found}", tag="TEST")
+
+ # Try to get the profile path
+ retrieved_path = profile_manager.get_profile_path(test_profile_name)
+ path_match = retrieved_path == profile_path
+ logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")
+
+ # Delete the profile
+ success = profile_manager.delete_profile(test_profile_name)
+ logger.info(f"Profile deletion successful: {success}", tag="TEST")
+
+ # Verify it's gone
+ profiles_after = profile_manager.list_profiles()
+ profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
+ logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")
+
+ # Clean up just in case
+ if os.path.exists(profile_path):
+ shutil.rmtree(profile_path, ignore_errors=True)
+
+ return profile_found and path_match and success and profile_removed
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Clean up test directory
+ try:
+            if profile_path and os.path.exists(profile_path):
+ shutil.rmtree(profile_path, ignore_errors=True)
+ except:
+ pass
+ return False
+
+async def test_profile_with_browser():
+ """Test using a profile with a browser."""
+ logger.info("Testing using a profile with a browser", tag="TEST")
+
+ profile_manager = BrowserProfileManager(logger=logger)
+ test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
+ profile_path = None
+
+ try:
+ # Create a test profile directory
+ profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
+ os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
+
+ # Create a dummy Preferences file to simulate a Chrome profile
+ with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
+ f.write("{\"test\": true}")
+
+ logger.info(f"Created test profile at: {profile_path}", tag="TEST")
+
+ # Now use this profile with a browser
+ browser_config = BrowserConfig(
+ user_data_dir=profile_path,
+ headless=True
+ )
+
+ manager = BrowserManager(browser_config=browser_config, logger=logger)
+
+ # Start the browser with the profile
+ await manager.start()
+ logger.info("Browser started with profile", tag="TEST")
+
+ # Create a page
+ crawler_config = CrawlerRunConfig()
+ page, context = await manager.get_page(crawler_config)
+
+ # Navigate and set some data to verify profile works
+ await page.goto("https://example.com")
+ await page.evaluate("localStorage.setItem('test_data', 'profile_value')")
+
+ # Close browser
+ await manager.close()
+ logger.info("First browser session closed", tag="TEST")
+
+ # Create a new browser with the same profile
+ manager2 = BrowserManager(browser_config=browser_config, logger=logger)
+ await manager2.start()
+ logger.info("Second browser session started with same profile", tag="TEST")
+
+ # Get a page and check if the data persists
+ page2, context2 = await manager2.get_page(crawler_config)
+ await page2.goto("https://example.com")
+ data = await page2.evaluate("localStorage.getItem('test_data')")
+
+ # Verify data persisted
+ data_persisted = data == "profile_value"
+ logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")
+
+ # Clean up
+ await manager2.close()
+ logger.info("Second browser session closed", tag="TEST")
+
+ # Delete the test profile
+ success = profile_manager.delete_profile(test_profile_name)
+ logger.info(f"Test profile deleted: {success}", tag="TEST")
+
+ return data_persisted and success
+ except Exception as e:
+ logger.error(f"Test failed: {str(e)}", tag="TEST")
+ # Clean up
+ try:
+ if profile_path and os.path.exists(profile_path):
+ shutil.rmtree(profile_path, ignore_errors=True)
+ except:
+ pass
+ return False
+
+async def run_tests():
+ """Run all tests sequentially."""
+ results = []
+
+ results.append(await test_profile_creation())
+ results.append(await test_profile_with_browser())
+
+ # Print summary
+ total = len(results)
+ passed = sum(results)
+ logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")
+
+ if passed == total:
+ logger.success("All tests passed!", tag="SUMMARY")
+ else:
+ logger.error(f"{total - passed} tests failed", tag="SUMMARY")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py
new file mode 100644
index 00000000..b7416dc2
--- /dev/null
+++ b/tests/cli/test_cli.py
@@ -0,0 +1,133 @@
+import pytest
+from click.testing import CliRunner
+from pathlib import Path
+import json
+import yaml
+from crawl4ai.cli import cli, load_config_file, parse_key_values
+import tempfile
+import os
+import click
+
+@pytest.fixture
+def runner():
+ return CliRunner()
+
+@pytest.fixture
+def temp_config_dir():
+ with tempfile.TemporaryDirectory() as tmpdir:
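+        # Point HOME at the temp dir so any config the CLI resolves under the home directory stays isolated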
+ old_home = os.environ.get('HOME')
+ os.environ['HOME'] = tmpdir
+ yield Path(tmpdir)
+        if old_home is not None:
+            os.environ['HOME'] = old_home
+        else:
+            os.environ.pop('HOME', None)
+
+@pytest.fixture
+def sample_configs(temp_config_dir):
+ configs = {
+ 'browser.yml': {
+ 'headless': True,
+ 'viewport_width': 1280,
+ 'user_agent_mode': 'random'
+ },
+ 'crawler.yml': {
+ 'cache_mode': 'bypass',
+ 'wait_until': 'networkidle',
+ 'scan_full_page': True
+ },
+ 'extract_css.yml': {
+ 'type': 'json-css',
+ 'params': {'verbose': True}
+ },
+ 'css_schema.json': {
+ 'name': 'ArticleExtractor',
+ 'baseSelector': '.article',
+ 'fields': [
+ {'name': 'title', 'selector': 'h1.title', 'type': 'text'},
+ {'name': 'link', 'selector': 'a.read-more', 'type': 'attribute', 'attribute': 'href'}
+ ]
+ }
+ }
+
+ for filename, content in configs.items():
+ path = temp_config_dir / filename
+ with open(path, 'w') as f:
+ if filename.endswith('.yml'):
+ yaml.dump(content, f)
+ else:
+ json.dump(content, f)
+
+ return {name: str(temp_config_dir / name) for name in configs}
+
+class TestCLIBasics:
+ def test_help(self, runner):
+ result = runner.invoke(cli, ['--help'])
+ assert result.exit_code == 0
+ assert 'Crawl4AI CLI' in result.output
+
+ def test_examples(self, runner):
+ result = runner.invoke(cli, ['--example'])
+ assert result.exit_code == 0
+ assert 'Examples' in result.output
+
+ def test_missing_url(self, runner):
+ result = runner.invoke(cli)
+ assert result.exit_code != 0
+ assert 'URL argument is required' in result.output
+
+class TestConfigParsing:
+ def test_parse_key_values_basic(self):
+ result = parse_key_values(None, None, "key1=value1,key2=true")
+ assert result == {'key1': 'value1', 'key2': True}
+
+ def test_parse_key_values_invalid(self):
+ with pytest.raises(click.BadParameter):
+ parse_key_values(None, None, "invalid_format")
+
+class TestConfigLoading:
+ def test_load_yaml_config(self, sample_configs):
+ config = load_config_file(sample_configs['browser.yml'])
+ assert config['headless'] is True
+ assert config['viewport_width'] == 1280
+
+ def test_load_json_config(self, sample_configs):
+ config = load_config_file(sample_configs['css_schema.json'])
+ assert config['name'] == 'ArticleExtractor'
+ assert len(config['fields']) == 2
+
+ def test_load_nonexistent_config(self):
+ with pytest.raises(click.BadParameter):
+ load_config_file('nonexistent.yml')
+
+class TestLLMConfig:
+ def test_llm_config_creation(self, temp_config_dir, runner):
+ def input_simulation(inputs):
+ return runner.invoke(cli, ['https://example.com', '-q', 'test question'],
+ input='\n'.join(inputs))
+
+class TestCrawlingFeatures:
+ def test_basic_crawl(self, runner):
+ result = runner.invoke(cli, ['https://example.com'])
+ assert result.exit_code == 0
+
+
+class TestErrorHandling:
+ def test_invalid_config_file(self, runner):
+ result = runner.invoke(cli, [
+ 'https://example.com',
+ '--browser-config', 'nonexistent.yml'
+ ])
+ assert result.exit_code != 0
+
+ def test_invalid_schema(self, runner, temp_config_dir):
+ invalid_schema = temp_config_dir / 'invalid_schema.json'
+ with open(invalid_schema, 'w') as f:
+ f.write('invalid json')
+
+ result = runner.invoke(cli, [
+ 'https://example.com',
+ '--schema', str(invalid_schema)
+ ])
+ assert result.exit_code != 0
+
+if __name__ == '__main__':
+ pytest.main(['-v', '-s', '--tb=native', __file__])
\ No newline at end of file
diff --git a/tests/docker/test_config_object.py b/tests/docker/test_config_object.py
new file mode 100644
index 00000000..94a30f05
--- /dev/null
+++ b/tests/docker/test_config_object.py
@@ -0,0 +1,113 @@
+import json
+from crawl4ai import (
+ CrawlerRunConfig,
+ DefaultMarkdownGenerator,
+ RegexChunking,
+ JsonCssExtractionStrategy,
+ BM25ContentFilter,
+ CacheMode
+)
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.deep_crawling.filters import FastFilterChain
+from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
+from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
+
+def create_test_config() -> CrawlerRunConfig:
+ # Set up content filtering and markdown generation
+ content_filter = BM25ContentFilter(
+ user_query="technology articles",
+ )
+
+ markdown_generator = DefaultMarkdownGenerator(
+ content_filter=content_filter,
+ options={"ignore_links": False, "body_width": 0}
+ )
+
+ # Set up extraction strategy
+ extraction_schema = {
+ "name": "ArticleExtractor",
+ "baseSelector": "article.content",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".article-body", "type": "html"}
+ ]
+ }
+ extraction_strategy = JsonCssExtractionStrategy(schema=extraction_schema)
+
+ # Set up deep crawling
+ filter_chain = FastFilterChain([
+ FastContentTypeFilter(["text/html"]),
+ FastDomainFilter(blocked_domains=["ads.*"])
+ ])
+
+ url_scorer = FastKeywordRelevanceScorer(
+ keywords=["article", "blog"],
+ weight=1.0
+ )
+
+ deep_crawl_strategy = BFSDeepCrawlStrategy(
+ max_depth=3,
+ filter_chain=filter_chain,
+ url_scorer=url_scorer
+ )
+
+ # Create the config
+ config = CrawlerRunConfig(
+ word_count_threshold=200,
+ extraction_strategy=extraction_strategy,
+ chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
+ markdown_generator=markdown_generator,
+ css_selector="main.content",
+ excluded_tags=["nav", "footer"],
+ keep_attrs=["href", "src"],
+ cache_mode=CacheMode.BYPASS,
+ wait_until="networkidle",
+ page_timeout=30000,
+ scan_full_page=True,
+ deep_crawl_strategy=deep_crawl_strategy,
+ verbose=True,
+ stream=True
+ )
+
+ return config
+
+def test_config_serialization_cycle():
+ # Create original config
+ original_config = create_test_config()
+
+ # Dump to serializable dictionary
+ serialized = original_config.dump()
+
+ print(json.dumps(serialized, indent=2))
+
+ # Load back into config object
+ deserialized_config = CrawlerRunConfig.load(serialized)
+
+ # Verify core attributes
+ assert deserialized_config.word_count_threshold == original_config.word_count_threshold
+ assert deserialized_config.css_selector == original_config.css_selector
+ assert deserialized_config.excluded_tags == original_config.excluded_tags
+ assert deserialized_config.keep_attrs == original_config.keep_attrs
+ assert deserialized_config.cache_mode == original_config.cache_mode
+ assert deserialized_config.wait_until == original_config.wait_until
+ assert deserialized_config.page_timeout == original_config.page_timeout
+ assert deserialized_config.scan_full_page == original_config.scan_full_page
+ assert deserialized_config.verbose == original_config.verbose
+ assert deserialized_config.stream == original_config.stream
+
+ # Verify complex objects
+ assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy)
+ assert isinstance(deserialized_config.chunking_strategy, RegexChunking)
+ assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator)
+ assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter)
+ assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy)
+
+ # Verify deep crawl strategy configuration
+ assert deserialized_config.deep_crawl_strategy.max_depth == 3
+ assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain)
+ assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)
+
+ print("Serialization cycle test passed successfully!")
+
+if __name__ == "__main__":
+ test_config_serialization_cycle()
\ No newline at end of file
diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py
new file mode 100644
index 00000000..cf95671e
--- /dev/null
+++ b/tests/docker/test_docker.py
@@ -0,0 +1,175 @@
+import requests
+import time
+import httpx
+import asyncio
+from typing import Dict, Any
+from crawl4ai import (
+ BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
+ PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
+)
+from crawl4ai import LLMConfig
+from crawl4ai.docker_client import Crawl4aiDockerClient
+
+class Crawl4AiTester:
+ def __init__(self, base_url: str = "http://localhost:11235"):
+ self.base_url = base_url
+
+ def submit_and_wait(
+ self, request_data: Dict[str, Any], timeout: int = 300
+ ) -> Dict[str, Any]:
+ # Submit crawl job
+ response = requests.post(f"{self.base_url}/crawl", json=request_data)
+ task_id = response.json()["task_id"]
+ print(f"Task ID: {task_id}")
+
+ # Poll for result
+ start_time = time.time()
+ while True:
+ if time.time() - start_time > timeout:
+ raise TimeoutError(
+ f"Task {task_id} did not complete within {timeout} seconds"
+ )
+
+ result = requests.get(f"{self.base_url}/task/{task_id}")
+ status = result.json()
+
+ if status["status"] == "failed":
+ print("Task failed:", status.get("error"))
+ raise Exception(f"Task failed: {status.get('error')}")
+
+ if status["status"] == "completed":
+ return status
+
+ time.sleep(2)
+
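+# Illustrative /task/{task_id} payloads seen while polling (only the keys used above; values are placeholders):
+#   {"status": "processing"}               # hypothetical intermediate state
+#   {"status": "failed", "error": "..."}
+#   {"status": "completed", ...}
+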
+async def test_direct_api():
+ """Test direct API endpoints without using the client SDK"""
+ print("\n=== Testing Direct API Calls ===")
+
+ # Test 1: Basic crawl with content filtering
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1200,
+ viewport_height=800
+ )
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed",
+ min_word_threshold=0
+ ),
+ options={"ignore_links": True}
+ )
+ )
+
+ request_data = {
+ "urls": ["https://example.com"],
+ "browser_config": browser_config.dump(),
+ "crawler_config": crawler_config.dump()
+ }
+
+ # Make direct API call
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ "http://localhost:8000/crawl",
+ json=request_data,
+ timeout=300
+ )
+ assert response.status_code == 200
+ result = response.json()
+ print("Basic crawl result:", result["success"])
+
+ # Test 2: Structured extraction with JSON CSS
+ schema = {
+ "baseSelector": "article.post",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".content", "type": "html"}
+ ]
+ }
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=JsonCssExtractionStrategy(schema=schema)
+ )
+
+ request_data["crawler_config"] = crawler_config.dump()
+
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ "http://localhost:8000/crawl",
+ json=request_data
+ )
+ assert response.status_code == 200
+ result = response.json()
+ print("Structured extraction result:", result["success"])
+
+ # Test 3: Get schema
+ # async with httpx.AsyncClient() as client:
+ # response = await client.get("http://localhost:8000/schema")
+ # assert response.status_code == 200
+ # schemas = response.json()
+ # print("Retrieved schemas for:", list(schemas.keys()))
+
+async def test_with_client():
+ """Test using the Crawl4AI Docker client SDK"""
+ print("\n=== Testing Client SDK ===")
+
+ async with Crawl4aiDockerClient(verbose=True) as client:
+ # Test 1: Basic crawl
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed"
+ )
+ )
+ )
+
+ result = await client.crawl(
+ urls=["https://example.com"],
+ browser_config=browser_config,
+ crawler_config=crawler_config
+ )
+ print("Client SDK basic crawl:", result.success)
+
+        # Test 2: LLM content filtering with streaming
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=LLMContentFilter(
+                    llm_config=LLMConfig(provider="openai/gpt-4o"),
+ instruction="Extract key technical concepts"
+ )
+ ),
+ stream=True
+ )
+
+ async for result in await client.crawl(
+ urls=["https://example.com"],
+ browser_config=browser_config,
+ crawler_config=crawler_config
+ ):
+ print(f"Streaming result for: {result.url}")
+
+ # # Test 3: Get schema
+ # schemas = await client.get_schema()
+ # print("Retrieved client schemas for:", list(schemas.keys()))
+
+async def main():
+ """Run all tests"""
+ # Test direct API
+ print("Testing direct API calls...")
+ await test_direct_api()
+
+ # Test client SDK
+ print("\nTesting client SDK...")
+ await test_with_client()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/docker/test_dockerclient.py b/tests/docker/test_dockerclient.py
new file mode 100644
index 00000000..cba6c4c9
--- /dev/null
+++ b/tests/docker/test_dockerclient.py
@@ -0,0 +1,34 @@
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import (
+ BrowserConfig,
+ CrawlerRunConfig
+)
+
+async def main():
+ async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
+ await client.authenticate("test@example.com")
+
+ # Non-streaming crawl
+ results = await client.crawl(
+ ["https://example.com", "https://python.org"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=CrawlerRunConfig()
+ )
+ print(f"Non-streaming results: {results}")
+
+ # Streaming crawl
+ crawler_config = CrawlerRunConfig(stream=True)
+ async for result in await client.crawl(
+ ["https://example.com", "https://python.org"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=crawler_config
+ ):
+ print(f"Streamed result: {result}")
+
+ # Get schema
+ schema = await client.get_schema()
+ print(f"Schema: {schema}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/docker/test_rest_api_deep_crawl.py b/tests/docker/test_rest_api_deep_crawl.py
new file mode 100644
index 00000000..c535727f
--- /dev/null
+++ b/tests/docker/test_rest_api_deep_crawl.py
@@ -0,0 +1,596 @@
+# ==== File: test_rest_api_deep_crawl.py ====
+
+import pytest
+import pytest_asyncio
+import httpx
+import json
+import asyncio
+import os
+from typing import List, Dict, Any, AsyncGenerator
+
+from dotenv import load_dotenv
+load_dotenv() # Load environment variables from .env file if present
+
+# --- Test Configuration ---
+# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")  # If server is running in Docker, use the host's IP
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")  # If server is running in dev debug mode
+DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
+DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
+
+# --- Helper Functions ---
+def load_proxies_from_env() -> List[Dict]:
+ """Load proxies from PROXIES environment variable"""
+ proxies = []
+ proxies_str = os.getenv("PROXIES", "")
+ if not proxies_str:
+ print("PROXIES environment variable not set or empty.")
+ return proxies
+ try:
+ proxy_list = proxies_str.split(",")
+ for proxy in proxy_list:
+ proxy = proxy.strip()
+ if not proxy:
+ continue
+ parts = proxy.split(":")
+ if len(parts) == 4:
+ ip, port, username, password = parts
+ proxies.append({
+ "server": f"http://{ip}:{port}", # Assuming http, adjust if needed
+ "username": username,
+ "password": password,
+ "ip": ip # Store original IP if available
+ })
+ elif len(parts) == 2: # ip:port only
+ ip, port = parts
+ proxies.append({
+ "server": f"http://{ip}:{port}",
+ "ip": ip
+ })
+ else:
+ print(f"Skipping invalid proxy string format: {proxy}")
+
+ except Exception as e:
+ print(f"Error loading proxies from environment: {e}")
+ return proxies
+
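+# Example PROXIES value accepted by the parser above (hypothetical addresses):
+#   PROXIES="203.0.113.10:8080:user1:pass1,203.0.113.11:3128"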
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ try:
+ response = await client.get("/health")
+ response.raise_for_status()
+ print(f"\nServer healthy: {response.json()}")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
+
+async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
+ """Asserts the basic structure of a single crawl result."""
+ assert isinstance(result, dict)
+ assert "url" in result
+ assert "success" in result
+ assert "html" in result # Basic crawls should return HTML
+ assert "metadata" in result
+ assert isinstance(result["metadata"], dict)
+ assert "depth" in result["metadata"] # Deep crawls add depth
+
+ if check_ssl:
+ assert "ssl_certificate" in result # Check if SSL info is present
+ assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
+
+
+async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
+ """Processes an NDJSON streaming response."""
+ results = []
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ break # Stop processing after completion marker
+ elif data.get("url"): # Ensure it looks like a result object
+ results.append(data)
+ else:
+ print(f"Received non-result JSON line: {data}") # Log other status messages if needed
+ except json.JSONDecodeError:
+ pytest.fail(f"Failed to decode JSON line: {line}")
+ assert completed, "Streaming response did not end with a completion marker."
+ return results
+
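+# Illustrative NDJSON stream from /crawl/stream (shape inferred from the checks above; values are placeholders):
+#   {"url": "https://docs.crawl4ai.com/...", "success": true, "html": "...", "metadata": {"depth": 1}}
+#   {"status": "completed"}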
+
+# --- Pytest Fixtures ---
+@pytest_asyncio.fixture(scope="function")
+async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
+ """Provides an async HTTP client"""
+ # Increased timeout for potentially longer deep crawls
+ async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
+ yield client
+ # No explicit close needed with 'async with'
+
+# --- Test Class ---
+@pytest.mark.asyncio
+class TestDeepCrawlEndpoints:
+
+ @pytest_asyncio.fixture(autouse=True)
+ async def check_health_before_tests(self, async_client: httpx.AsyncClient):
+ """Fixture to ensure server is healthy before each test in the class."""
+ await check_server_health(async_client)
+
+ # 1. Basic Deep Crawl
+ async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with limited depth and pages."""
+ max_depth = 1
+ max_pages = 3 # start_url + 2 more
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS", # Use string value for CacheMode
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ # Minimal filters for basic test
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) > 1 # Should be more than just the start URL
+ assert len(data["results"]) <= max_pages # Respect max_pages
+
+ found_depth_0 = False
+ found_depth_1 = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert DEEP_CRAWL_DOMAIN in result["url"]
+ depth = result["metadata"]["depth"]
+ assert depth <= max_depth
+ if depth == 0: found_depth_0 = True
+ if depth == 1: found_depth_1 = True
+
+ assert found_depth_0
+ assert found_depth_1
+
+ # 2. Deep Crawl with Filtering
+ async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with content type and domain filters."""
+ max_depth = 1
+ max_pages = 5
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+ },
+ {
+ "type": "ContentTypeFilter",
+ "params": {"allowed_types": ["text/html"]}
+ },
+ # Example: Exclude specific paths using regex
+ {
+ "type": "URLPatternFilter",
+ "params": {
+ "patterns": ["*/category-3/*"], # Block category 3
+ "reverse": True # Block if match
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert DEEP_CRAWL_DOMAIN in result["url"]
+ assert "category-3" not in result["url"] # Check if filter worked
+ assert result["metadata"]["depth"] <= max_depth
+
+ # 3. Deep Crawl with Scoring
+ async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with URL scoring."""
+ max_depth = 1
+ max_pages = 4
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": { # Keep basic domain filter
+ "type": "FilterChain",
+ "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+ },
+ "url_scorer": { # Add scorer
+ "type": "CompositeScorer",
+ "params": {
+ "scorers": [
+ { # Favor pages with 'product' in the URL
+ "type": "KeywordRelevanceScorer",
+ "params": {"keywords": ["product"], "weight": 1.0}
+ },
+ { # Penalize deep paths slightly
+ "type": "PathDepthScorer",
+ "params": {"optimal_depth": 2, "weight": -0.2}
+ }
+ ]
+ }
+ },
+ # Set a threshold if needed: "score_threshold": 0.1
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
+ product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
+ print(f"Product URLs found among depth > 0 results: {product_urls_found}")
+ # We expect scoring to prioritize product pages if available within limits
+ # assert product_urls_found # This might be too strict depending on site structure and limits
+
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["metadata"]["depth"] <= max_depth
+
+ # 4. Deep Crawl with CSS Extraction
+ async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
+ max_depth = 6 # Go deep enough to reach product pages
+ max_pages = 20
+ # Schema to extract product details
+ product_schema = {
+ "name": "ProductDetails",
+ "baseSelector": "div.container", # Base for product page
+ "fields": [
+ {"name": "product_title", "selector": "h1", "type": "text"},
+ {"name": "price", "selector": ".product-price", "type": "text"},
+ {"name": "description", "selector": ".product-description p", "type": "text"},
+ {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
+ {"name": "spec_name", "selector": ".spec-name", "type": "text"},
+ {"name": "spec_value", "selector": ".spec-value", "type": "text"}
+ ]}
+ ]
+ }
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "extraction_strategy": { # Apply extraction to ALL crawled pages
+ "type": "JsonCssExtractionStrategy",
+ "params": {"schema": {"type": "dict", "value": product_schema}}
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": { # Only crawl HTML on our domain
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+ {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+ ]
+ }
+ }
+ # Optional: Add scoring to prioritize product pages for extraction
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ # assert len(data["results"]) <= max_pages
+
+ found_extracted_product = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ if "product_" in result["url"]: # Check product pages specifically
+ assert result["extracted_content"] is not None
+ try:
+ extracted = json.loads(result["extracted_content"])
+ # Schema returns list even if one base match
+ assert isinstance(extracted, list)
+ if extracted:
+ item = extracted[0]
+ assert "product_title" in item and item["product_title"]
+ assert "price" in item and item["price"]
+ # Specs might be empty list if not found
+ assert "specs" in item and isinstance(item["specs"], list)
+ found_extracted_product = True
+ print(f"Extracted product: {item.get('product_title')}")
+ except (json.JSONDecodeError, AssertionError, IndexError) as e:
+ pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+ # else:
+ # # Non-product pages might have None or empty list depending on schema match
+ # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
+
+ assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
+
+ # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
+ async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl combined with LLMExtractionStrategy."""
+ max_depth = 1 # Limit depth to keep LLM calls manageable
+ max_pages = 3
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "extraction_strategy": { # Apply LLM extraction to crawled pages
+ "type": "LLMExtractionStrategy",
+ "params": {
+ "instruction": "Extract the main H1 title and the text content of the first paragraph.",
+ "llm_config": { # Example override, rely on server default if possible
+ "type": "LLMConfig",
+ "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
+ },
+ "schema": { # Expected JSON output
+ "type": "dict",
+ "value": {
+ "title": "PageContent", "type": "object",
+ "properties": {
+ "h1_title": {"type": "string"},
+ "first_paragraph": {"type": "string"}
+ }
+ }
+ }
+ }
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+ {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
+ except httpx.RequestError as e:
+ pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
+
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ found_llm_extraction = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+ try:
+ extracted = json.loads(result["extracted_content"])
+ if isinstance(extracted, list): extracted = extracted[0] # Handle list output
+ assert isinstance(extracted, dict)
+ assert "h1_title" in extracted # Check keys based on schema
+ assert "first_paragraph" in extracted
+ found_llm_extraction = True
+ print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
+ except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
+ pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+
+ assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
+
+
+ # 6. Deep Crawl with SSL Certificate Fetching
+ async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with fetch_ssl_certificate enabled."""
+ max_depth = 0 # Only fetch for start URL to keep test fast
+ max_pages = 1
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "fetch_ssl_certificate": True, # <-- Enable SSL fetching
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+
+ await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
+ assert result["success"] is True
+ # Check if SSL info was actually retrieved
+ if result["ssl_certificate"]:
+ # Assert directly using dictionary keys
+ assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
+ assert "issuer" in result["ssl_certificate"]
+ assert "subject" in result["ssl_certificate"]
+ # --- MODIFIED ASSERTIONS ---
+ assert "not_before" in result["ssl_certificate"] # Check for the actual key
+ assert "not_after" in result["ssl_certificate"] # Check for the actual key
+ # --- END MODIFICATIONS ---
+ assert "fingerprint" in result["ssl_certificate"] # Check another key
+
+ # This print statement using .get() already works correctly with dictionaries
+ print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
+ print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
+ else:
+ # This part remains the same
+ print("SSL Certificate was null in the result.")
+
+
+ # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
+ async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl using proxy rotation."""
+ proxies = load_proxies_from_env()
+ if not proxies:
+ pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
+
+ print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
+
+ max_depth = 1
+ max_pages = 3
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
+ # Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "proxy_rotation_strategy": { # <-- Define the strategy
+ "type": "RoundRobinProxyStrategy",
+ "params": {
+ # Convert ProxyConfig dicts back to the serialized format expected by server
+ "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
+ }
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ # Proxies often cause connection errors, catch them
+ pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
+ except httpx.RequestError as e:
+ pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+ # Primary assertion is that the crawl succeeded *with* proxy config
+ print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
+
+ # Verifying specific proxy usage requires server logs or custom headers/responses
+
+
+# --- Main Execution Block (for running script directly) ---
+if __name__ == "__main__":
+ pytest_args = ["-v", "-s", __file__]
+ # Example: Run only proxy test
+ # pytest_args.append("-k test_deep_crawl_with_proxies")
+ print(f"Running pytest with args: {pytest_args}")
+ exit_code = pytest.main(pytest_args)
+ print(f"Pytest finished with exit code: {exit_code}")
\ No newline at end of file
diff --git a/tests/docker/test_serialization.py b/tests/docker/test_serialization.py
new file mode 100644
index 00000000..6ce80005
--- /dev/null
+++ b/tests/docker/test_serialization.py
@@ -0,0 +1,255 @@
+import inspect
+from typing import Any, Dict
+from enum import Enum
+
+from crawl4ai import LLMConfig
+
+def to_serializable_dict(obj: Any) -> Dict:
+ """
+ Recursively convert an object to a serializable dictionary using {type, params} structure
+ for complex objects.
+ """
+ if obj is None:
+ return None
+
+ # Handle basic types
+ if isinstance(obj, (str, int, float, bool)):
+ return obj
+
+ # Handle Enum
+ if isinstance(obj, Enum):
+ return {
+ "type": obj.__class__.__name__,
+ "params": obj.value
+ }
+
+ # Handle datetime objects
+ if hasattr(obj, 'isoformat'):
+ return obj.isoformat()
+
+ # Handle lists, tuples, and sets
+ if isinstance(obj, (list, tuple, set)):
+ return [to_serializable_dict(item) for item in obj]
+
+ # Handle dictionaries - preserve them as-is
+ if isinstance(obj, dict):
+ return {
+ "type": "dict", # Mark as plain dictionary
+ "value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
+ }
+
+ # Handle class instances
+ if hasattr(obj, '__class__'):
+ # Get constructor signature
+ sig = inspect.signature(obj.__class__.__init__)
+ params = sig.parameters
+
+ # Get current values
+ current_values = {}
+ for name, param in params.items():
+ if name == 'self':
+ continue
+
+ value = getattr(obj, name, param.default)
+
+ # Only include if different from default, considering empty values
+ if not (is_empty_value(value) and is_empty_value(param.default)):
+ if value != param.default:
+ current_values[name] = to_serializable_dict(value)
+
+ return {
+ "type": obj.__class__.__name__,
+ "params": current_values
+ }
+
+ return str(obj)
+
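+# Illustrative round-trip for an Enum value (matches the commented example output further below):
+#   to_serializable_dict(CacheMode.BYPASS)   -> {"type": "CacheMode", "params": "bypass"}
+#   from_serializable_dict({"type": "CacheMode", "params": "bypass"}) -> CacheMode.BYPASS
+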
+def from_serializable_dict(data: Any) -> Any:
+ """
+ Recursively convert a serializable dictionary back to an object instance.
+ """
+ if data is None:
+ return None
+
+ # Handle basic types
+ if isinstance(data, (str, int, float, bool)):
+ return data
+
+ # Handle typed data
+ if isinstance(data, dict) and "type" in data:
+ # Handle plain dictionaries
+ if data["type"] == "dict":
+ return {k: from_serializable_dict(v) for k, v in data["value"].items()}
+
+ # Import from crawl4ai for class instances
+ import crawl4ai
+ cls = getattr(crawl4ai, data["type"])
+
+ # Handle Enum
+ if issubclass(cls, Enum):
+ return cls(data["params"])
+
+ # Handle class instances
+ constructor_args = {
+ k: from_serializable_dict(v) for k, v in data["params"].items()
+ }
+ return cls(**constructor_args)
+
+ # Handle lists
+ if isinstance(data, list):
+ return [from_serializable_dict(item) for item in data]
+
+ # Handle raw dictionaries (legacy support)
+ if isinstance(data, dict):
+ return {k: from_serializable_dict(v) for k, v in data.items()}
+
+ return data
+
+def is_empty_value(value: Any) -> bool:
+ """Check if a value is effectively empty/null."""
+ if value is None:
+ return True
+ if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
+ return True
+ return False
+
+# if __name__ == "__main__":
+# from crawl4ai import (
+# CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
+# PruningContentFilter, BM25ContentFilter, LLMContentFilter,
+# JsonCssExtractionStrategy, CosineStrategy, RegexChunking,
+# WebScrapingStrategy, LXMLWebScrapingStrategy
+# )
+
+# # Test Case 1: BM25 content filtering through markdown generator
+# config1 = CrawlerRunConfig(
+# cache_mode=CacheMode.BYPASS,
+# markdown_generator=DefaultMarkdownGenerator(
+# content_filter=BM25ContentFilter(
+# user_query="technology articles",
+# bm25_threshold=1.2,
+# language="english"
+# )
+# ),
+# chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
+# excluded_tags=["nav", "footer", "aside"],
+# remove_overlay_elements=True
+# )
+
+# # Serialize
+# serialized = to_serializable_dict(config1)
+# print("\nSerialized Config:")
+# print(serialized)
+
+# # Example output structure would now look like:
+# """
+# {
+# "type": "CrawlerRunConfig",
+# "params": {
+# "cache_mode": {
+# "type": "CacheMode",
+# "params": "bypass"
+# },
+# "markdown_generator": {
+# "type": "DefaultMarkdownGenerator",
+# "params": {
+# "content_filter": {
+# "type": "BM25ContentFilter",
+# "params": {
+# "user_query": "technology articles",
+# "bm25_threshold": 1.2,
+# "language": "english"
+# }
+# }
+# }
+# }
+# }
+# }
+# """
+
+# # Deserialize
+# deserialized = from_serializable_dict(serialized)
+# print("\nDeserialized Config:")
+# print(to_serializable_dict(deserialized))
+
+# # Verify they match
+# assert to_serializable_dict(config1) == to_serializable_dict(deserialized)
+# print("\nVerification passed: Configuration matches after serialization/deserialization!")
+
+if __name__ == "__main__":
+ from crawl4ai import (
+ CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
+ PruningContentFilter, BM25ContentFilter, LLMContentFilter,
+ JsonCssExtractionStrategy, RegexChunking,
+ WebScrapingStrategy, LXMLWebScrapingStrategy
+ )
+
+ # Test Case 1: BM25 content filtering through markdown generator
+ config1 = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=BM25ContentFilter(
+ user_query="technology articles",
+ bm25_threshold=1.2,
+ language="english"
+ )
+ ),
+ chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
+ excluded_tags=["nav", "footer", "aside"],
+ remove_overlay_elements=True
+ )
+
+    # Test Case 2: JSON/CSS extraction with pruning filter
+ schema = {
+ "baseSelector": "article.post",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".content", "type": "html"}
+ ]
+ }
+ config2 = CrawlerRunConfig(
+ extraction_strategy=JsonCssExtractionStrategy(schema=schema),
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed",
+ min_word_threshold=0
+ ),
+ options={"ignore_links": True}
+ ),
+ scraping_strategy=LXMLWebScrapingStrategy()
+ )
+
+    # Test Case 3: LLM content filter
+ config3 = CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=LLMContentFilter(
+                llm_config=LLMConfig(provider="openai/gpt-4"),
+ instruction="Extract key technical concepts",
+ chunk_token_threshold=2000,
+ overlap_rate=0.1
+ ),
+ options={"ignore_images": True}
+ ),
+ scraping_strategy=WebScrapingStrategy()
+ )
+
+ # Test all configurations
+ test_configs = [config1, config2, config3]
+
+ for i, config in enumerate(test_configs, 1):
+ print(f"\nTesting Configuration {i}:")
+
+ # Serialize
+ serialized = to_serializable_dict(config)
+ print(f"\nSerialized Config {i}:")
+ print(serialized)
+
+ # Deserialize
+ deserialized = from_serializable_dict(serialized)
+ print(f"\nDeserialized Config {i}:")
+ print(to_serializable_dict(deserialized)) # Convert back to dict for comparison
+
+ # Verify they match
+ assert to_serializable_dict(config) == to_serializable_dict(deserialized)
+ print(f"\nVerification passed: Configuration {i} matches after serialization/deserialization!")
\ No newline at end of file
diff --git a/tests/docker/test_server.py b/tests/docker/test_server.py
new file mode 100644
index 00000000..7bb0195b
--- /dev/null
+++ b/tests/docker/test_server.py
@@ -0,0 +1,146 @@
+import asyncio
+import json
+from typing import Optional
+from urllib.parse import quote
+
+async def test_endpoint(
+ endpoint: str,
+ url: str,
+ params: Optional[dict] = None,
+ expected_status: int = 200
+) -> Optional[dict]:
+ """Test an endpoint and print results"""
+ import aiohttp
+
+ params = params or {}
+ param_str = "&".join(f"{k}={v}" for k, v in params.items())
+ full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
+ if param_str:
+ full_url += f"?{param_str}"
+
+ print(f"\nTesting: {full_url}")
+
+ try:
+ async with aiohttp.ClientSession() as session:
+ async with session.get(full_url) as response:
+ status = response.status
+ try:
+ data = await response.json()
+ except:
+ data = await response.text()
+
+ print(f"Status: {status} (Expected: {expected_status})")
+ if isinstance(data, dict):
+ print(f"Response: {json.dumps(data, indent=2)}")
+ else:
+ print(f"Response: {data[:500]}...") # First 500 chars
+ assert status == expected_status
+ return data
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ return None
+
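+# Example URL built by test_endpoint for the markdown endpoint (illustrative values):
+#   http://localhost:8000/md/example.com?f=fit&c=1
+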
+async def test_llm_task_completion(task_id: str) -> Optional[dict]:
+ """Poll task until completion"""
+ for _ in range(10): # Try 10 times
+ result = await test_endpoint("llm", task_id)
+ if result and result.get("status") in ["completed", "failed"]:
+ return result
+ print("Task still processing, waiting 5 seconds...")
+ await asyncio.sleep(5)
+ print("Task timed out")
+
+async def run_tests():
+ print("Starting API Tests...")
+
+ # Test URLs
+ urls = [
+ "example.com",
+ "https://www.python.org",
+ "https://news.ycombinator.com/news",
+ "https://github.com/trending"
+ ]
+
+ print("\n=== Testing Markdown Endpoint ===")
+    for url in []:  # NOTE: disabled; change to "for url in urls:" to exercise the markdown endpoint
+ # Test different filter types
+ for filter_type in ["raw", "fit", "bm25", "llm"]:
+ params = {"f": filter_type}
+ if filter_type in ["bm25", "llm"]:
+ params["q"] = "extract main content"
+
+ # Test with and without cache
+ for cache in ["0", "1"]:
+ params["c"] = cache
+ await test_endpoint("md", url, params)
+ await asyncio.sleep(1) # Be nice to the server
+
+ print("\n=== Testing LLM Endpoint ===")
+    for url in []:  # NOTE: disabled; change to "for url in urls:" to exercise the LLM endpoint
+ # Test basic extraction
+ result = await test_endpoint(
+ "llm",
+ url,
+ {"q": "Extract title and main content"}
+ )
+ if result and "task_id" in result:
+ print("\nChecking task completion...")
+ await test_llm_task_completion(result["task_id"])
+
+ # Test with schema
+ schema = {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "content": {"type": "string"},
+ "links": {"type": "array", "items": {"type": "string"}}
+ }
+ }
+ result = await test_endpoint(
+ "llm",
+ url,
+ {
+ "q": "Extract content with links",
+ "s": json.dumps(schema),
+ "c": "1" # Test with cache
+ }
+ )
+ if result and "task_id" in result:
+ print("\nChecking schema task completion...")
+ await test_llm_task_completion(result["task_id"])
+
+ await asyncio.sleep(2) # Be nice to the server
+
+ print("\n=== Testing Error Cases ===")
+ # Test invalid URL
+ await test_endpoint(
+ "md",
+ "not_a_real_url",
+ expected_status=500
+ )
+
+ # Test invalid filter type
+ await test_endpoint(
+ "md",
+ "example.com",
+ {"f": "invalid"},
+ expected_status=422
+ )
+
+ # Test LLM without query
+ await test_endpoint(
+ "llm",
+ "example.com"
+ )
+
+ # Test invalid task ID
+ await test_endpoint(
+ "llm",
+ "llm_invalid_task",
+ expected_status=404
+ )
+
+ print("\nAll tests completed!")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py
new file mode 100644
index 00000000..56d2ada4
--- /dev/null
+++ b/tests/docker/test_server_requests.py
@@ -0,0 +1,655 @@
+import pytest
+import pytest_asyncio
+import httpx
+import json
+import asyncio
+import os
+from typing import List, Dict, Any, AsyncGenerator
+
+from dotenv import load_dotenv
+load_dotenv()
+
+
+# Optional: Import crawl4ai classes directly for reference and easier payload creation.
+# You don't strictly NEED these imports for the tests to run against the server,
+# but they help in understanding the structure you are mimicking in JSON.
+from crawl4ai import (
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ DefaultMarkdownGenerator,
+ PruningContentFilter,
+ BM25ContentFilter,
+ BFSDeepCrawlStrategy,
+ FilterChain,
+ ContentTypeFilter,
+ DomainFilter,
+ CompositeScorer,
+ KeywordRelevanceScorer,
+ PathDepthScorer,
+ JsonCssExtractionStrategy,
+ LLMExtractionStrategy,
+ LLMConfig
+)
+
+# --- Test Configuration ---
+# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
+# Use a known simple HTML page for basic tests
+SIMPLE_HTML_URL = "https://httpbin.org/html"
+# Use a site suitable for scraping tests
+SCRAPE_TARGET_URL = "http://books.toscrape.com/"
+# Use a site with internal links for deep crawl tests
+DEEP_CRAWL_URL = "https://python.org"
+
+# --- Pytest Fixtures ---
+
+# Use the built-in event_loop fixture from pytest_asyncio
+# The custom implementation was causing issues with closing the loop
+
+@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
+async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
+ """Provides an async HTTP client"""
+ client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
+ yield client
+ await client.aclose()
+
+# --- Helper Functions ---
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ try:
+ response = await client.get("/health")
+ response.raise_for_status()
+ print(f"\nServer healthy: {response.json()}")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
+
+async def assert_crawl_result_structure(result: Dict[str, Any]):
+ """Asserts the basic structure of a single crawl result."""
+ assert isinstance(result, dict)
+ assert "url" in result
+ assert "success" in result
+ assert "html" in result
+ # Add more common checks if needed
+
+async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
+ """Processes an NDJSON streaming response."""
+ results = []
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ break # Stop processing after completion marker
+ else:
+ results.append(data)
+ except json.JSONDecodeError:
+ pytest.fail(f"Failed to decode JSON line: {line}")
+ assert completed, "Streaming response did not end with a completion marker."
+ return results
+
+
+# --- Test Class ---
+
+@pytest.mark.asyncio
+class TestCrawlEndpoints:
+
+ @pytest_asyncio.fixture(autouse=True)
+ async def check_health_before_tests(self, async_client: httpx.AsyncClient):
+ """Fixture to ensure server is healthy before each test in the class."""
+ await check_server_health(async_client)
+
+ # 1. Simple Requests (Primitives)
+ async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
+ """Test /crawl with a single URL and simple config values."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False, # Explicitly false for /crawl
+ "screenshot": False,
+ "cache_mode": CacheMode.BYPASS.value # Use enum value
+ }
+ }
+ }
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error: {e}")
+ print(f"Response content: {e.response.text}")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] == SIMPLE_HTML_URL
+        assert "Herman Melville - Moby-Dick" in result["html"]
+ # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
+ # It might be null, missing, or populated depending on the server's default behavior
+
+ async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
+ """Test /crawl/stream with a single URL and simple config values."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": True, # Must be true for /crawl/stream
+ "screenshot": False,
+ "cache_mode": CacheMode.BYPASS.value
+ }
+ }
+ }
+ async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+ response.raise_for_status()
+ results = await process_streaming_response(response)
+
+ assert len(results) == 1
+ result = results[0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] == SIMPLE_HTML_URL
+        assert "Herman Melville - Moby-Dick" in result["html"]
+
+
+ # 2. Multi-URL and Dispatcher
+ async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
+ """Test /crawl with multiple URLs, implicitly testing dispatcher."""
+ urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+ payload = {
+ "urls": urls,
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
+ }
+ }
+ try:
+            print("Sending multi-URL crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+ except:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) == len(urls)
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] in urls
+
+ async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
+ """Test /crawl/stream with multiple URLs."""
+ urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+ payload = {
+ "urls": urls,
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
+ }
+ }
+ async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+ response.raise_for_status()
+ results = await process_streaming_response(response)
+
+ assert len(results) == len(urls)
+ processed_urls = set()
+ for result in results:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] in urls
+ processed_urls.add(result["url"])
+ assert processed_urls == set(urls) # Ensure all URLs were processed
+
+
+ # 3. Class Values and Nested Classes (Markdown Generator)
+ async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
+ """Test /crawl with MarkdownGenerator using PruningContentFilter."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.ENABLED.value, # Test different cache mode
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {
+ "threshold": 0.5, # Example param
+ "threshold_type": "relative"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+            print("Sending markdown generation (pruning filter) request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+ except:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "markdown" in result
+ assert isinstance(result["markdown"], dict)
+ assert "raw_markdown" in result["markdown"]
+ assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
+ assert "Moby-Dick" in result["markdown"]["raw_markdown"]
+ # Fit markdown content might be different/shorter due to pruning
+ assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
+
+ async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
+ """Test /crawl with MarkdownGenerator using BM25ContentFilter."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "BM25ContentFilter",
+ "params": {
+ "user_query": "Herman Melville", # Query for BM25
+ "bm25_threshold": 0.1, # Lower threshold to increase matches
+ "language": "english" # Valid parameters
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Payload for BM25 test: {json.dumps(payload)}")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+ except:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "markdown" in result
+ assert isinstance(result["markdown"], dict)
+ assert "raw_markdown" in result["markdown"]
+ assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
+
+ # Print values for debug
+ print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
+ print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
+
+ # Either fit_markdown has content (possibly including our query terms)
+ # or it might be empty if no good BM25 matches were found
+ # Don't assert specific content since it can be environment-dependent
+
+
+ # 4. Deep Crawling
+ async def test_deep_crawl(self, async_client: httpx.AsyncClient):
+ """Test /crawl with a deep crawl strategy."""
+ payload = {
+ "urls": [DEEP_CRAWL_URL], # Start URL
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": CacheMode.BYPASS.value,
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": 1, # Limit depth for testing speed
+ "max_pages": 5, # Limit pages to crawl
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "ContentTypeFilter",
+ "params": {"allowed_types": ["text/html"]}
+ },
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
+ }
+ ]
+ }
+ },
+ "url_scorer": {
+ "type": "CompositeScorer",
+ "params": {
+ "scorers": [
+ {
+ "type": "KeywordRelevanceScorer",
+ "params": {"keywords": ["documentation", "tutorial"]}
+ },
+ {
+ "type": "PathDepthScorer",
+ "params": {"weight": 0.5, "optimal_depth": 2}
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Sending deep crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+ except:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ # Expect more than 1 result due to deep crawl (start URL + crawled links)
+ assert len(data["results"]) > 1
+        assert len(data["results"]) <= 6 # Start URL + up to max_pages=5 crawled pages
+
+ start_url_found = False
+ crawled_urls_found = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+
+ # Print URL for debugging
+ print(f"Crawled URL: {result['url']}")
+
+ # Allow URLs that contain python.org (including subdomains like docs.python.org)
+ assert "python.org" in result["url"]
+ if result["url"] == DEEP_CRAWL_URL:
+ start_url_found = True
+ else:
+ crawled_urls_found = True
+
+ assert start_url_found
+ assert crawled_urls_found
+
+
+ # 5. Extraction without LLM (JSON/CSS)
+ async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
+ """Test /crawl with JsonCssExtractionStrategy."""
+ payload = {
+ "urls": [SCRAPE_TARGET_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.BYPASS.value,
+ "extraction_strategy": {
+ "type": "JsonCssExtractionStrategy",
+ "params": {
+ "schema": {
+ "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
+ "value": {
+ "name": "BookList",
+ "baseSelector": "ol.row li.col-xs-6", # Select each book item
+ "fields": [
+ {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+ {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
+ {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+            print("Sending JSON/CSS extraction request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+ except:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+
+ # Extracted content should be a JSON string representing a list of dicts
+ try:
+ extracted_data = json.loads(result["extracted_content"])
+ assert isinstance(extracted_data, list)
+ assert len(extracted_data) > 0 # Should find some books
+ # Check structure of the first extracted item
+ first_item = extracted_data[0]
+ assert "title" in first_item
+ assert "price" in first_item
+ assert "rating" in first_item
+ assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
+ except (json.JSONDecodeError, AssertionError) as e:
+ pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
+
+
+ # 6. Extraction with LLM
+ async def test_llm_extraction(self, async_client: httpx.AsyncClient):
+ """
+ Test /crawl with LLMExtractionStrategy.
+ NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
+ configured via .llm.env or environment variables.
+ This test uses the default provider configured in the server's config.yml.
+ """
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.BYPASS.value,
+ "extraction_strategy": {
+ "type": "LLMExtractionStrategy",
+ "params": {
+ "instruction": "Extract the main title and the author mentioned in the text into JSON.",
+ # LLMConfig is implicitly defined by server's config.yml and .llm.env
+ # If you needed to override provider/token PER REQUEST:
+ "llm_config": {
+ "type": "LLMConfig",
+ "params": {
+ "provider": "openai/gpt-4o", # Example override
+ "api_token": os.getenv("OPENAI_API_KEY") # Example override
+ }
+ },
+ "schema": { # Optional: Provide a schema for structured output
+ "type": "dict", # IMPORTANT: Wrap schema dict
+ "value": {
+ "title": "Book Info",
+ "type": "object",
+ "properties": {
+ "title": {"type": "string", "description": "The main title of the work"},
+ "author": {"type": "string", "description": "The author of the work"}
+ },
+ "required": ["title", "author"]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ # Catch potential server errors (like 500 due to missing/invalid API keys)
+ pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
+ except httpx.RequestError as e:
+ pytest.fail(f"LLM extraction request failed: {e}.")
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+
+ # Extracted content should be JSON (because we provided a schema)
+ try:
+ extracted_data = json.loads(result["extracted_content"])
+ print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
+
+ # Handle both dict and list formats (server returns a list)
+ if isinstance(extracted_data, list):
+ assert len(extracted_data) > 0
+ extracted_item = extracted_data[0] # Take first item
+ assert isinstance(extracted_item, dict)
+ assert "title" in extracted_item
+ assert "author" in extracted_item
+ assert "Moby-Dick" in extracted_item.get("title", "")
+ assert "Herman Melville" in extracted_item.get("author", "")
+ else:
+ assert isinstance(extracted_data, dict)
+ assert "title" in extracted_data
+ assert "author" in extracted_data
+ assert "Moby-Dick" in extracted_data.get("title", "")
+ assert "Herman Melville" in extracted_data.get("author", "")
+ except (json.JSONDecodeError, AssertionError) as e:
+ pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
+ except Exception as e: # Catch any other unexpected error
+ pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
+
+if __name__ == "__main__":
+ # Define arguments for pytest programmatically
+ # -v: verbose output
+ # -s: show print statements immediately (useful for debugging)
+ # __file__: tells pytest to run tests in the current file
+ pytest_args = ["-v", "-s", __file__]
+
+ # You can add more pytest arguments here if needed, for example:
+ # '-k test_llm_extraction': Run only the LLM test function
+ # pytest_args.append("-k test_llm_extraction")
+
+ print(f"Running pytest with args: {pytest_args}")
+
+ # Execute pytest
+ exit_code = pytest.main(pytest_args)
+
+ print(f"Pytest finished with exit code: {exit_code}")
\ No newline at end of file
diff --git a/tests/docker/test_server_token.py b/tests/docker/test_server_token.py
new file mode 100644
index 00000000..220b6ca2
--- /dev/null
+++ b/tests/docker/test_server_token.py
@@ -0,0 +1,212 @@
+import asyncio
+import json
+from typing import Optional
+from urllib.parse import quote
+
+async def get_token(session, email: str = "test@example.com") -> str:
+ """Fetch a JWT token from the /token endpoint."""
+ url = "http://localhost:8000/token"
+ payload = {"email": email}
+ print(f"\nFetching token from {url} with email: {email}")
+ try:
+ async with session.post(url, json=payload) as response:
+ status = response.status
+ data = await response.json()
+ print(f"Token Response Status: {status}")
+ print(f"Token Response: {json.dumps(data, indent=2)}")
+ if status == 200:
+ return data["access_token"]
+ else:
+ raise Exception(f"Failed to get token: {data.get('detail', 'Unknown error')}")
+ except Exception as e:
+ print(f"Error fetching token: {str(e)}")
+ raise
+
+async def test_endpoint(
+ session,
+ endpoint: str,
+ url: str,
+ token: str,
+ params: Optional[dict] = None,
+ expected_status: int = 200
+) -> Optional[dict]:
+ """Test an endpoint with token and print results."""
+ params = params or {}
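+    # Build the query string by hand; values are assumed to be simple and are not URL-encoded here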
+ param_str = "&".join(f"{k}={v}" for k, v in params.items())
+ full_url = f"http://localhost:8000/{endpoint}/{quote(url)}"
+ if param_str:
+ full_url += f"?{param_str}"
+
+ headers = {"Authorization": f"Bearer {token}"}
+ print(f"\nTesting: {full_url}")
+
+ try:
+ async with session.get(full_url, headers=headers) as response:
+ status = response.status
+ try:
+ data = await response.json()
+            except Exception:  # body is not JSON; fall back to plain text
+ data = await response.text()
+
+ print(f"Status: {status} (Expected: {expected_status})")
+ if isinstance(data, dict):
+ print(f"Response: {json.dumps(data, indent=2)}")
+ else:
+ print(f"Response: {data[:500]}...") # First 500 chars
+ assert status == expected_status, f"Expected {expected_status}, got {status}"
+ return data
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ return None
+
+
+async def test_stream_crawl(session, token: str):
+ """Test the /crawl/stream endpoint with multiple URLs."""
+ url = "http://localhost:8000/crawl/stream"
+ payload = {
+ "urls": [
+ "https://example.com",
+ "https://example.com/page1", # Replicated example.com with variation
+ "https://example.com/page2", # Replicated example.com with variation
+ "https://example.com/page3", # Replicated example.com with variation
+ # "https://www.python.org",
+ # "https://news.ycombinator.com/news"
+ ],
+ "browser_config": {"headless": True, "viewport": {"width": 1200}},
+ "crawler_config": {"stream": True, "cache_mode": "bypass"}
+ }
+ headers = {"Authorization": f"Bearer {token}"}
+ print(f"\nTesting Streaming Crawl: {url}")
+ print(f"Payload: {json.dumps(payload, indent=2)}")
+
+ try:
+ async with session.post(url, json=payload, headers=headers) as response:
+ status = response.status
+ print(f"Status: {status} (Expected: 200)")
+ assert status == 200, f"Expected 200, got {status}"
+
+ # Read streaming response line-by-line (NDJSON)
+ async for line in response.content:
+ if line:
+ data = json.loads(line.decode('utf-8').strip())
+ print(f"Streamed Result: {json.dumps(data, indent=2)}")
+ except Exception as e:
+ print(f"Error in streaming crawl test: {str(e)}")
+
+async def run_tests():
+ import aiohttp
+ print("Starting API Tests...")
+
+ # Test URLs
+ urls = [
+ "example.com",
+ "https://www.python.org",
+ "https://news.ycombinator.com/news",
+ "https://github.com/trending"
+ ]
+
+ async with aiohttp.ClientSession() as session:
+ # Fetch token once and reuse it
+ token = await get_token(session)
+ if not token:
+ print("Aborting tests due to token failure!")
+ return
+
+ print("\n=== Testing Crawl Endpoint ===")
+ crawl_payload = {
+ "urls": ["https://example.com"],
+ "browser_config": {"headless": True},
+ "crawler_config": {"stream": False}
+ }
+ async with session.post(
+ "http://localhost:8000/crawl",
+ json=crawl_payload,
+ headers={"Authorization": f"Bearer {token}"}
+ ) as response:
+ status = response.status
+ data = await response.json()
+ print(f"\nCrawl Endpoint Status: {status}")
+ print(f"Crawl Response: {json.dumps(data, indent=2)}")
+
+
+ print("\n=== Testing Crawl Stream Endpoint ===")
+ await test_stream_crawl(session, token)
+
+ print("\n=== Testing Markdown Endpoint ===")
+        for url in []:  # md endpoint checks disabled here; swap [] for `urls` to re-enable
+ for filter_type in ["raw", "fit", "bm25", "llm"]:
+ params = {"f": filter_type}
+ if filter_type in ["bm25", "llm"]:
+ params["q"] = "extract main content"
+
+ for cache in ["0", "1"]:
+ params["c"] = cache
+ await test_endpoint(session, "md", url, token, params)
+ await asyncio.sleep(1) # Be nice to the server
+
+ print("\n=== Testing LLM Endpoint ===")
+ for url in urls:
+ # Test basic extraction (direct response now)
+ result = await test_endpoint(
+ session,
+ "llm",
+ url,
+ token,
+ {"q": "Extract title and main content"}
+ )
+
+ # Test with schema (direct response)
+ schema = {
+ "type": "object",
+ "properties": {
+ "title": {"type": "string"},
+ "content": {"type": "string"},
+ "links": {"type": "array", "items": {"type": "string"}}
+ }
+ }
+ result = await test_endpoint(
+ session,
+ "llm",
+ url,
+ token,
+ {
+ "q": "Extract content with links",
+ "s": json.dumps(schema),
+ "c": "1" # Test with cache
+ }
+ )
+ await asyncio.sleep(2) # Be nice to the server
+
+ print("\n=== Testing Error Cases ===")
+ # Test invalid URL
+ await test_endpoint(
+ session,
+ "md",
+ "not_a_real_url",
+ token,
+ expected_status=500
+ )
+
+ # Test invalid filter type
+ await test_endpoint(
+ session,
+ "md",
+ "example.com",
+ token,
+ {"f": "invalid"},
+ expected_status=422
+ )
+
+        # Test LLM without a query parameter (should fail per the server's validation logic)
+ await test_endpoint(
+ session,
+ "llm",
+ "example.com",
+ token,
+ expected_status=400
+ )
+
+ print("\nAll tests completed!")
+
+if __name__ == "__main__":
+ asyncio.run(run_tests())
\ No newline at end of file
diff --git a/tests/general/generate_dummy_site.py b/tests/general/generate_dummy_site.py
new file mode 100644
index 00000000..d4218b6b
--- /dev/null
+++ b/tests/general/generate_dummy_site.py
@@ -0,0 +1,335 @@
+# ==== File: build_dummy_site.py ====
+
+import os
+import random
+import argparse
+from pathlib import Path
+from urllib.parse import quote
+
+# --- Configuration ---
+NUM_CATEGORIES = 3
+NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
+NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
+MAX_DEPTH_TARGET = 5 # Explicitly set target depth
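+# With the defaults above the generator emits 100 HTML pages in total:
+# 1 home + 3 category + 6 sub-category + 90 product-level pages
+# (each of the 30 products gets a main page, a details page, and a reviews page).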
+
+# --- Helper Functions ---
+
+def generate_lorem(words=20):
+ """Generates simple placeholder text."""
+ lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
+ "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
+ "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
+ return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
+
+def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
+ """Creates an HTML file with basic structure and inline CSS."""
+ os.makedirs(filepath.parent, exist_ok=True)
+
+ # Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
+ breadcrumb_html = ""
+ if breadcrumbs:
+        links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
+        breadcrumb_html = f'<nav class="breadcrumbs">{links_html} » <span class="current">{title}</span></nav>'
+
+ # Basic CSS for structure identification (kept the same)
+ css = """
+
+ """
+ html_content = f"""
+
+
+
+
+
{title} - FakeShop
+ {head_extras}
+ {css}
+
+
+
+ {breadcrumb_html}
+
{title}
+ {body_content}
+
+
+"""
+ with open(filepath, "w", encoding="utf-8") as f:
+ f.write(html_content)
+ # Keep print statement concise for clarity
+ # print(f"Created: {filepath}")
+
+def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
+ """Generates the dummy website structure."""
+ base_dir.mkdir(parents=True, exist_ok=True)
+
+ # --- Clean and prepare the base path for URL construction ---
+ # Ensure it starts with '/' if not empty, and remove any trailing '/'
+ if base_path:
+ full_base_path = "/" + base_path.strip('/')
+ else:
+ full_base_path = "" # Represents the root
+
+ print(f"Using base path for links: '{full_base_path}'")
+
+ # --- Level 0: Homepage ---
+ home_body = "
Welcome to FakeShop! Your one-stop shop for imaginary items.
Categories: \n
"
+ # Define the *actual* link path for the homepage breadcrumb
+ home_link_path = f"{full_base_path}/index.html"
+ breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
+
+ # Links *within* the page content should remain relative
+ for i in range(NUM_CATEGORIES):
+ cat_name = f"Category-{i+1}"
+ cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
+ # This path is relative to the current directory (index.html)
+ cat_relative_page_path = f"{cat_folder_name}/index.html"
+        home_body += f'<li><a href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
+    home_body += "</ul>"
+ create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
+
+ # --- Levels 1-5 ---
+ for i in range(NUM_CATEGORIES):
+ cat_name = f"Category-{i+1}"
+ cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
+ cat_dir = base_dir / cat_folder_name
+ # This is the *absolute* path for the breadcrumb link
+ cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
+ # Update breadcrumbs list for this level
+ breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
+
+ # --- Level 1: Category Page ---
+ cat_body = f"
{generate_lorem(15)} for {cat_name}.
Sub-Categories: \n
"
+ for j in range(NUM_SUBCATEGORIES_PER_CAT):
+ subcat_name = f"{cat_name}-Sub-{j+1}"
+ subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
+ # Path relative to the category page
+ subcat_relative_page_path = f"{subcat_folder_name}/index.html"
+            cat_body += f'<li><a href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
+        cat_body += "</ul>"
+ # Pass the updated breadcrumbs list
+ create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
+
+ for j in range(NUM_SUBCATEGORIES_PER_CAT):
+ subcat_name = f"{cat_name}-Sub-{j+1}"
+ subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
+ subcat_dir = cat_dir / subcat_folder_name
+ # Absolute path for the breadcrumb link
+ subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
+ # Update breadcrumbs list for this level
+ breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
+
+ # --- Level 2: Sub-Category Page (Product List) ---
+ subcat_body = f"
Explore products in {subcat_name}. {generate_lorem(12)}
Products: \n
"
+ for k in range(NUM_PRODUCTS_PER_SUBCAT):
+ prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
+ prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
+ # Filename relative to the subcategory page
+ prod_filename = f"product_{prod_id}.html"
+ # Absolute path for the breadcrumb link
+ prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
+
+ # Preview on list page (link remains relative)
+ subcat_body += f"""
+
+
+
{prod_name}
+
{generate_lorem(10)}
+
£{random.uniform(10, 500):.2f}
+
+ """
+
+ # --- Level 3: Product Page ---
+ prod_price = random.uniform(10, 500)
+ prod_desc = generate_lorem(40)
+ prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
+ prod_reviews_count = random.randint(0, 150)
+ # Relative filenames for links on this page
+ details_filename_relative = f"product_{prod_id}_details.html"
+ reviews_filename_relative = f"product_{prod_id}_reviews.html"
+
+ prod_body = f"""
+ Price: £{prod_price:.2f}
+
+
Description
+
{prod_desc}
+
+
+
Specifications
+
+ {''.join(f'{name} : {value} ' for name, value in prod_specs.items())}
+
+
+
+
Reviews
+
Total Reviews: {prod_reviews_count}
+
+
+
+ View More Details |
+ See All Reviews
+
+ """
+ # Update breadcrumbs list for this level
+ breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
+
+ # --- Level 4: Product Details Page ---
+ details_filename = f"product_{prod_id}_details.html" # Actual filename
+ # Absolute path for the breadcrumb link
+ details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
+ details_body = f"This page contains extremely detailed information about {prod_name}.
{generate_lorem(100)}"
+ # Update breadcrumbs list for this level
+ breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
+
+ # --- Level 5: Product Reviews Page ---
+ reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
+ # Absolute path for the breadcrumb link
+ reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
+ reviews_body = f"All {prod_reviews_count} reviews for {prod_name} are listed here.
"
+ for r in range(prod_reviews_count):
+ reviews_body += f"Review {r+1}: {generate_lorem(random.randint(15, 50))} "
+ reviews_body += " "
+ # Update breadcrumbs list for this level
+ breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
+
+
+ subcat_body += " " # Close product-list ul
+ # Pass the correct breadcrumbs list for the subcategory index page
+ create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
+
+
+# --- Main Execution ---
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
+ parser.add_argument(
+ "-o", "--output-dir",
+ type=str,
+ default="dummy_retail_site",
+ help="Directory to generate the website in."
+ )
+ parser.add_argument(
+ "-n", "--site-name",
+ type=str,
+ default="FakeShop",
+ help="Name of the fake shop."
+ )
+ parser.add_argument(
+ "-b", "--base-path",
+ type=str,
+ default="",
+ help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
+ )
+ # Optional: Add more args to configure counts if needed
+
+ args = parser.parse_args()
+
+ output_directory = Path(args.output_dir)
+ site_name = args.site_name
+ base_path = args.base_path
+
+ print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
+ # Pass the base_path to the generation function
+ generate_site(output_directory, site_name, base_path)
+ print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
+ print("Dummy site generation complete.")
+ print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
+ if base_path:
+ print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
+ else:
+ print(f"Access the site at: http://localhost:8000/index.html")
\ No newline at end of file
diff --git a/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py b/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py
new file mode 100644
index 00000000..2727d1e4
--- /dev/null
+++ b/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py
@@ -0,0 +1,56 @@
+import asyncio
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ HTTPCrawlerConfig,
+ CacheMode,
+ DefaultMarkdownGenerator,
+ PruningContentFilter
+)
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_logger import AsyncLogger
+
+async def main():
+ # Initialize HTTP crawler strategy
+ http_strategy = AsyncHTTPCrawlerStrategy(
+ browser_config=HTTPCrawlerConfig(
+ method="GET",
+ verify_ssl=True,
+ follow_redirects=True
+ ),
+ logger=AsyncLogger(verbose=True)
+ )
+
+ # Initialize web crawler with HTTP strategy
+ async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed",
+ min_word_threshold=0
+ )
+ )
+ )
+
+ # Test different URLs
+ urls = [
+ "https://example.com",
+ "https://httpbin.org/get",
+ "raw://Test content"
+ ]
+
+ for url in urls:
+ print(f"\n=== Testing {url} ===")
+ try:
+ result = await crawler.arun(url=url, config=crawler_config)
+ print(f"Status: {result.status_code}")
+ print(f"Raw HTML length: {len(result.html)}")
+ if hasattr(result, 'markdown'):
+ print(f"Markdown length: {len(result.markdown.raw_markdown)}")
+ except Exception as e:
+ print(f"Error: {e}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/general/test_advanced_deep_crawl.py b/tests/general/test_advanced_deep_crawl.py
new file mode 100644
index 00000000..dd291f67
--- /dev/null
+++ b/tests/general/test_advanced_deep_crawl.py
@@ -0,0 +1,46 @@
+import asyncio
+import time
+
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+
+
+async def main():
+ """Example deep crawl of documentation site."""
+ filter_chain = FilterChain([
+ URLPatternFilter(patterns=["*2025*"]),
+ DomainFilter(allowed_domains=["techcrunch.com"]),
+ ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
+ ContentTypeFilter(allowed_types=["text/html","application/javascript"])
+ ])
+ config = CrawlerRunConfig(
+ deep_crawl_strategy = BestFirstCrawlingStrategy(
+ max_depth=2,
+ include_external=False,
+ filter_chain=filter_chain,
+ url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
+ ),
+ stream=False,
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ scraping_strategy=LXMLWebScrapingStrategy()
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ print("Starting deep crawl in streaming mode:")
+ config.stream = True
+ start_time = time.perf_counter()
+ async for result in await crawler.arun(
+ url="https://techcrunch.com",
+ config=config
+ ):
+ print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
+ print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/general/test_async_crawler_strategy.py
similarity index 100%
rename from tests/20241401/test_async_crawler_strategy.py
rename to tests/general/test_async_crawler_strategy.py
diff --git a/tests/20241401/test_async_markdown_generator.py b/tests/general/test_async_markdown_generator.py
similarity index 100%
rename from tests/20241401/test_async_markdown_generator.py
rename to tests/general/test_async_markdown_generator.py
diff --git a/tests/20241401/test_async_webcrawler.py b/tests/general/test_async_webcrawler.py
similarity index 100%
rename from tests/20241401/test_async_webcrawler.py
rename to tests/general/test_async_webcrawler.py
diff --git a/tests/20241401/test_cache_context.py b/tests/general/test_cache_context.py
similarity index 100%
rename from tests/20241401/test_cache_context.py
rename to tests/general/test_cache_context.py
diff --git a/tests/general/test_content_source_parameter.py b/tests/general/test_content_source_parameter.py
new file mode 100644
index 00000000..e686eaf8
--- /dev/null
+++ b/tests/general/test_content_source_parameter.py
@@ -0,0 +1,106 @@
+"""
+Tests for the content_source parameter in markdown generation.
+"""
+import unittest
+import asyncio
+from unittest.mock import patch, MagicMock
+
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.models import MarkdownGenerationResult
+
+HTML_SAMPLE = """
+
+
Test Page
+
+
Test Content
+
This is a test paragraph.
+
+
This is content within a container.
+
+
+
+"""
+
+
+class TestContentSourceParameter(unittest.TestCase):
+ """Test cases for the content_source parameter in markdown generation."""
+
+ def setUp(self):
+ """Set up test fixtures."""
+ self.loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(self.loop)
+
+ def tearDown(self):
+ """Tear down test fixtures."""
+ self.loop.close()
+
+ def test_default_content_source(self):
+ """Test that the default content_source is 'cleaned_html'."""
+ # Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator
+ generator = DefaultMarkdownGenerator()
+ self.assertEqual(generator.content_source, "cleaned_html")
+
+ def test_custom_content_source(self):
+ """Test that content_source can be customized."""
+ generator = DefaultMarkdownGenerator(content_source="fit_html")
+ self.assertEqual(generator.content_source, "fit_html")
+
+ @patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text')
+ def test_html_processing_using_input_html(self, mock_html2text):
+ """Test that generate_markdown uses input_html parameter."""
+ # Setup mock
+ mock_instance = MagicMock()
+ mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph."
+ mock_html2text.return_value = mock_instance
+
+ # Create generator and call generate_markdown
+ generator = DefaultMarkdownGenerator()
+        result = generator.generate_markdown(input_html="<h1>Test Content</h1><p>This is a test paragraph.</p>")
+
+ # Verify input_html was passed to HTML2Text handler
+ mock_instance.handle.assert_called_once()
+ # Get the first positional argument
+ args, _ = mock_instance.handle.call_args
+ self.assertEqual(args[0], "
Test Content This is a test paragraph.
")
+
+ # Check result
+ self.assertIsInstance(result, MarkdownGenerationResult)
+ self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.")
+
+ def test_html_source_selection_logic(self):
+ """Test that the HTML source selection logic works correctly."""
+ # We'll test the dispatch pattern directly to avoid async complexities
+
+ # Create test data
+ raw_html = "
Raw HTML "
+ cleaned_html = "
Cleaned HTML "
+ fit_html = "
Preprocessed HTML "
+
+ # Test the dispatch pattern
+ html_source_selector = {
+ "raw_html": lambda: raw_html,
+ "cleaned_html": lambda: cleaned_html,
+ "fit_html": lambda: fit_html,
+ }
+
+ # Test Case 1: content_source="cleaned_html"
+ source_lambda = html_source_selector.get("cleaned_html")
+ self.assertEqual(source_lambda(), cleaned_html)
+
+ # Test Case 2: content_source="raw_html"
+ source_lambda = html_source_selector.get("raw_html")
+ self.assertEqual(source_lambda(), raw_html)
+
+ # Test Case 3: content_source="fit_html"
+ source_lambda = html_source_selector.get("fit_html")
+ self.assertEqual(source_lambda(), fit_html)
+
+ # Test Case 4: Invalid content_source falls back to cleaned_html
+ source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html)
+ self.assertEqual(source_lambda(), cleaned_html)
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/tests/general/test_crawlers.py b/tests/general/test_crawlers.py
new file mode 100644
index 00000000..45fb8fcb
--- /dev/null
+++ b/tests/general/test_crawlers.py
@@ -0,0 +1,17 @@
+
+# example_usage.py
+import asyncio
+from crawl4ai.crawlers import get_crawler
+
+async def main():
+ # Get the registered crawler
+ example_crawler = get_crawler("example_site.content")
+
+ # Crawl example.com
+ result = await example_crawler(url="https://example.com")
+
+ print(result)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/general/test_deep_crawl.py b/tests/general/test_deep_crawl.py
new file mode 100644
index 00000000..2f533cc5
--- /dev/null
+++ b/tests/general/test_deep_crawl.py
@@ -0,0 +1,46 @@
+import asyncio
+import time
+
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+
+
+async def main():
+ """Example deep crawl of documentation site."""
+ config = CrawlerRunConfig(
+ deep_crawl_strategy = BFSDeepCrawlStrategy(
+ max_depth=2,
+ include_external=False
+ ),
+ stream=False,
+ verbose=True,
+ cache_mode=CacheMode.BYPASS,
+ scraping_strategy=LXMLWebScrapingStrategy()
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ start_time = time.perf_counter()
+ print("\nStarting deep crawl in batch mode:")
+ results = await crawler.arun(
+ url="https://docs.crawl4ai.com",
+ config=config
+ )
+ print(f"Crawled {len(results)} pages")
+ print(f"Example page: {results[0].url}")
+ print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
+
+ print("Starting deep crawl in streaming mode:")
+ config.stream = True
+ start_time = time.perf_counter()
+ async for result in await crawler.arun(
+ url="https://docs.crawl4ai.com",
+ config=config
+ ):
+ print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
+ print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/general/test_deep_crawl_filters.py b/tests/general/test_deep_crawl_filters.py
new file mode 100644
index 00000000..948bbcbd
--- /dev/null
+++ b/tests/general/test_deep_crawl_filters.py
@@ -0,0 +1,279 @@
+from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
+async def test_pattern_filter():
+ # Test cases as list of tuples instead of dict for multiple patterns
+ test_cases = [
+ # Simple suffix patterns (*.html)
+ ("*.html", {
+ "https://example.com/page.html": True,
+ "https://example.com/path/doc.html": True,
+ "https://example.com/page.htm": False,
+ "https://example.com/page.html?param=1": True,
+ }),
+
+ # Path prefix patterns (/foo/*)
+ ("*/article/*", {
+ "https://example.com/article/123": True,
+ "https://example.com/blog/article/456": True,
+ "https://example.com/articles/789": False,
+ "https://example.com/article": False,
+ }),
+
+ # Complex patterns
+ ("blog-*-[0-9]", {
+ "https://example.com/blog-post-1": True,
+ "https://example.com/blog-test-9": True,
+ "https://example.com/blog-post": False,
+ "https://example.com/blog-post-x": False,
+ }),
+
+ # Multiple patterns case
+ (["*.pdf", "*/download/*"], {
+ "https://example.com/doc.pdf": True,
+ "https://example.com/download/file.txt": True,
+ "https://example.com/path/download/doc": True,
+ "https://example.com/uploads/file.txt": False,
+ }),
+
+ # Edge cases
+ ("*", {
+ "https://example.com": True,
+ "": True,
+ "http://test.com/path": True,
+ }),
+
+ # Complex regex
+ (r"^https?://.*\.example\.com/\d+", {
+ "https://sub.example.com/123": True,
+ "http://test.example.com/456": True,
+ "https://example.com/789": False,
+ "https://sub.example.com/abc": False,
+ })
+ ]
+
+ def run_accuracy_test():
+ print("\nAccuracy Tests:")
+ print("-" * 50)
+
+ all_passed = True
+ for patterns, test_urls in test_cases:
+ filter_obj = URLPatternFilter(patterns)
+
+ for url, expected in test_urls.items():
+ result = filter_obj.apply(url)
+ if result != expected:
+ print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
+ print(f" Expected: {expected}, Got: {result}")
+ all_passed = False
+ else:
+ print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
+
+ return all_passed
+
+ # Run tests
+ print("Running Pattern Filter Tests...")
+ accuracy_passed = run_accuracy_test()
+
+ if accuracy_passed:
+ print("\n✨ All accuracy tests passed!")
+
+ else:
+ print("\n❌ Some accuracy tests failed!")
+
+async def test_domain_filter():
+ from itertools import chain
+
+ # Test cases
+ test_cases = [
+ # Allowed domains
+ ({"allowed": "example.com"}, {
+ "https://example.com/page": True,
+ "http://example.com": True,
+ "https://sub.example.com": False,
+ "https://other.com": False,
+ }),
+
+ ({"allowed": ["example.com", "test.com"]}, {
+ "https://example.com/page": True,
+ "https://test.com/home": True,
+ "https://other.com": False,
+ }),
+
+ # Blocked domains
+ ({"blocked": "malicious.com"}, {
+ "https://malicious.com": False,
+ "https://safe.com": True,
+ "http://malicious.com/login": False,
+ }),
+
+ ({"blocked": ["spam.com", "ads.com"]}, {
+ "https://spam.com": False,
+ "https://ads.com/banner": False,
+ "https://example.com": True,
+ }),
+
+ # Allowed and Blocked combination
+ ({"allowed": "example.com", "blocked": "sub.example.com"}, {
+ "https://example.com": True,
+ "https://sub.example.com": False,
+ "https://other.com": False,
+ }),
+ ]
+
+ def run_accuracy_test():
+ print("\nAccuracy Tests:")
+ print("-" * 50)
+
+ all_passed = True
+ for params, test_urls in test_cases:
+ filter_obj = DomainFilter(
+ allowed_domains=params.get("allowed"),
+ blocked_domains=params.get("blocked"),
+ )
+
+ for url, expected in test_urls.items():
+ result = filter_obj.apply(url)
+ if result != expected:
+ print(f"\u274C Failed: Params {params} with URL '{url}'")
+ print(f" Expected: {expected}, Got: {result}")
+ all_passed = False
+ else:
+ print(f"\u2705 Passed: Params {params} with URL '{url}'")
+
+ return all_passed
+
+ # Run tests
+ print("Running Domain Filter Tests...")
+ accuracy_passed = run_accuracy_test()
+
+ if accuracy_passed:
+ print("\n\u2728 All accuracy tests passed!")
+ else:
+ print("\n\u274C Some accuracy tests failed!")
+
+async def test_content_relevance_filter():
+ relevance_filter = ContentRelevanceFilter(
+ query="What was the cause of american civil war?",
+ threshold=1
+ )
+
+ test_cases = {
+ "https://en.wikipedia.org/wiki/Cricket": False,
+ "https://en.wikipedia.org/wiki/American_Civil_War": True,
+ }
+
+ print("\nRunning Content Relevance Filter Tests...")
+ print("-" * 50)
+
+ all_passed = True
+ for url, expected in test_cases.items():
+ result = await relevance_filter.apply(url)
+ if result != expected:
+ print(f"\u274C Failed: URL '{url}'")
+ print(f" Expected: {expected}, Got: {result}")
+ all_passed = False
+ else:
+ print(f"\u2705 Passed: URL '{url}'")
+
+ if all_passed:
+ print("\n\u2728 All content relevance tests passed!")
+ else:
+ print("\n\u274C Some content relevance tests failed!")
+
+async def test_content_type_filter():
+ from itertools import chain
+
+ # Test cases
+ test_cases = [
+ # Allowed single type
+ ({"allowed": "image/png"}, {
+ "https://example.com/image.png": True,
+ "https://example.com/photo.jpg": False,
+ "https://example.com/document.pdf": False,
+ }),
+
+ # Multiple allowed types
+ ({"allowed": ["image/jpeg", "application/pdf"]}, {
+ "https://example.com/photo.jpg": True,
+ "https://example.com/document.pdf": True,
+ "https://example.com/script.js": False,
+ }),
+
+ # No extension should be allowed
+ ({"allowed": "application/json"}, {
+ "https://example.com/api/data": True,
+ "https://example.com/data.json": True,
+ "https://example.com/page.html": False,
+ }),
+
+ # Unknown extensions should not be allowed
+ ({"allowed": "application/octet-stream"}, {
+ "https://example.com/file.unknown": True,
+ "https://example.com/archive.zip": False,
+ "https://example.com/software.exe": False,
+ }),
+ ]
+
+ def run_accuracy_test():
+ print("\nAccuracy Tests:")
+ print("-" * 50)
+
+ all_passed = True
+ for params, test_urls in test_cases:
+ filter_obj = ContentTypeFilter(
+ allowed_types=params.get("allowed"),
+ )
+
+ for url, expected in test_urls.items():
+ result = filter_obj.apply(url)
+ if result != expected:
+ print(f"\u274C Failed: Params {params} with URL '{url}'")
+ print(f" Expected: {expected}, Got: {result}")
+ all_passed = False
+ else:
+ print(f"\u2705 Passed: Params {params} with URL '{url}'")
+
+ return all_passed
+
+ # Run tests
+ print("Running Content Type Filter Tests...")
+ accuracy_passed = run_accuracy_test()
+
+ if accuracy_passed:
+ print("\n\u2728 All accuracy tests passed!")
+ else:
+ print("\n\u274C Some accuracy tests failed!")
+
+async def test_seo_filter():
+ seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])
+
+ test_cases = {
+ "https://en.wikipedia.org/wiki/Search_engine_optimization": True,
+ "https://en.wikipedia.org/wiki/Randomness": False,
+ }
+
+ print("\nRunning SEO Filter Tests...")
+ print("-" * 50)
+
+ all_passed = True
+ for url, expected in test_cases.items():
+ result = await seo_filter.apply(url)
+ if result != expected:
+ print(f"\u274C Failed: URL '{url}'")
+ print(f" Expected: {expected}, Got: {result}")
+ all_passed = False
+ else:
+ print(f"\u2705 Passed: URL '{url}'")
+
+ if all_passed:
+ print("\n\u2728 All SEO filter tests passed!")
+ else:
+ print("\n\u274C Some SEO filter tests failed!")
+
+import asyncio
+
+if __name__ == "__main__":
+ asyncio.run(test_pattern_filter())
+ asyncio.run(test_domain_filter())
+ asyncio.run(test_content_type_filter())
+ asyncio.run(test_content_relevance_filter())
+ asyncio.run(test_seo_filter())
\ No newline at end of file
diff --git a/tests/general/test_deep_crawl_scorers.py b/tests/general/test_deep_crawl_scorers.py
new file mode 100644
index 00000000..8e68bca6
--- /dev/null
+++ b/tests/general/test_deep_crawl_scorers.py
@@ -0,0 +1,179 @@
+from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
+
+
+def test_scorers():
+ test_cases = [
+ # Keyword Scorer Tests
+ {
+ "scorer_type": "keyword",
+ "config": {
+ "keywords": ["python", "blog"],
+ "weight": 1.0,
+ "case_sensitive": False
+ },
+ "urls": {
+ "https://example.com/python-blog": 1.0,
+ "https://example.com/PYTHON-BLOG": 1.0,
+ "https://example.com/python-only": 0.5,
+ "https://example.com/other": 0.0
+ }
+ },
+
+ # Path Depth Scorer Tests
+ {
+ "scorer_type": "path_depth",
+ "config": {
+ "optimal_depth": 2,
+ "weight": 1.0
+ },
+ "urls": {
+ "https://example.com/a/b": 1.0,
+ "https://example.com/a": 0.5,
+ "https://example.com/a/b/c": 0.5,
+ "https://example.com": 0.33333333
+ }
+ },
+
+ # Content Type Scorer Tests
+ {
+ "scorer_type": "content_type",
+ "config": {
+ "type_weights": {
+ ".html$": 1.0,
+ ".pdf$": 0.8,
+ ".jpg$": 0.6
+ },
+ "weight": 1.0
+ },
+ "urls": {
+ "https://example.com/doc.html": 1.0,
+ "https://example.com/doc.pdf": 0.8,
+ "https://example.com/img.jpg": 0.6,
+ "https://example.com/other.txt": 0.0
+ }
+ },
+
+ # Freshness Scorer Tests
+ {
+ "scorer_type": "freshness",
+ "config": {
+ "weight": 1.0, # Remove current_year since original doesn't support it
+ },
+ "urls": {
+ "https://example.com/2024/01/post": 1.0,
+ "https://example.com/2023/12/post": 0.9,
+ "https://example.com/2022/post": 0.8,
+ "https://example.com/no-date": 0.5
+ }
+ },
+
+ # Domain Authority Scorer Tests
+ {
+ "scorer_type": "domain",
+ "config": {
+ "domain_weights": {
+ "python.org": 1.0,
+ "github.com": 0.8,
+ "medium.com": 0.6
+ },
+ "default_weight": 0.3,
+ "weight": 1.0
+ },
+ "urls": {
+ "https://python.org/about": 1.0,
+ "https://github.com/repo": 0.8,
+ "https://medium.com/post": 0.6,
+ "https://unknown.com": 0.3
+ }
+ }
+ ]
+
+ def create_scorer(scorer_type, config):
+ if scorer_type == "keyword":
+ return KeywordRelevanceScorer(**config)
+ elif scorer_type == "path_depth":
+ return PathDepthScorer(**config)
+ elif scorer_type == "content_type":
+ return ContentTypeScorer(**config)
+ elif scorer_type == "freshness":
+ return FreshnessScorer(**config,current_year=2024)
+ elif scorer_type == "domain":
+ return DomainAuthorityScorer(**config)
+
+ def run_accuracy_test():
+ print("\nAccuracy Tests:")
+ print("-" * 50)
+
+ all_passed = True
+ for test_case in test_cases:
+ print(f"\nTesting {test_case['scorer_type']} scorer:")
+ scorer = create_scorer(
+ test_case['scorer_type'],
+ test_case['config']
+ )
+
+ for url, expected in test_case['urls'].items():
+ score = round(scorer.score(url), 8)
+ expected = round(expected, 8)
+
+ if abs(score - expected) > 0.00001:
+ print(f"❌ Scorer Failed: URL '{url}'")
+ print(f" Expected: {expected}, Got: {score}")
+ all_passed = False
+ else:
+ print(f"✅ Scorer Passed: URL '{url}'")
+
+
+ return all_passed
+
+ def run_composite_test():
+ print("\nTesting Composite Scorer:")
+ print("-" * 50)
+
+ # Create test data
+ test_urls = {
+ "https://python.org/blog/2024/01/new-release.html":0.86666667,
+ "https://github.com/repo/old-code.pdf": 0.62,
+ "https://unknown.com/random": 0.26
+ }
+
+ # Create composite scorers with all types
+ scorers = []
+
+ for test_case in test_cases:
+ scorer = create_scorer(
+ test_case['scorer_type'],
+ test_case['config']
+ )
+ scorers.append(scorer)
+
+ composite = CompositeScorer(scorers, normalize=True)
+
+ all_passed = True
+ for url, expected in test_urls.items():
+ score = round(composite.score(url), 8)
+
+ if abs(score - expected) > 0.00001:
+ print(f"❌ Composite Failed: URL '{url}'")
+ print(f" Expected: {expected}, Got: {score}")
+ all_passed = False
+ else:
+ print(f"✅ Composite Passed: URL '{url}'")
+
+ return all_passed
+
+ # Run tests
+ print("Running Scorer Tests...")
+ accuracy_passed = run_accuracy_test()
+ composite_passed = run_composite_test()
+
+ if accuracy_passed and composite_passed:
+ print("\n✨ All tests passed!")
+ # Note: Already have performance tests in run_scorer_performance_test()
+ else:
+ print("\n❌ Some tests failed!")
+
+
+
+if __name__ == "__main__":
+ test_scorers()
\ No newline at end of file
diff --git a/tests/general/test_http_crawler_strategy.py b/tests/general/test_http_crawler_strategy.py
new file mode 100644
index 00000000..dc141418
--- /dev/null
+++ b/tests/general/test_http_crawler_strategy.py
@@ -0,0 +1,116 @@
+from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
+from crawl4ai.async_logger import AsyncLogger
+from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
+from crawl4ai.async_crawler_strategy import ConnectionTimeoutError
+import asyncio
+import os
+
+async def main():
+ """Test the AsyncHTTPCrawlerStrategy with various scenarios"""
+ logger = AsyncLogger(verbose=True)
+
+ # Initialize the strategy with default HTTPCrawlerConfig
+ crawler = AsyncHTTPCrawlerStrategy(
+ browser_config=HTTPCrawlerConfig(),
+ logger=logger
+ )
+ # Test 1: Basic HTTP GET
+ print("\n=== Test 1: Basic HTTP GET ===")
+ result = await crawler.crawl("https://example.com")
+ print(f"Status: {result.status_code}")
+ print(f"Content length: {len(result.html)}")
+ print(f"Headers: {dict(result.response_headers)}")
+
+ # Test 2: POST request with JSON
+ print("\n=== Test 2: POST with JSON ===")
+ crawler.browser_config = crawler.browser_config.clone(
+ method="POST",
+ json={"test": "data"},
+ headers={"Content-Type": "application/json"}
+ )
+ try:
+ result = await crawler.crawl(
+ "https://httpbin.org/post",
+ )
+ print(f"Status: {result.status_code}")
+ print(f"Response: {result.html[:200]}...")
+ except Exception as e:
+ print(f"Error: {e}")
+
+ # Test 3: File handling
+ crawler.browser_config = HTTPCrawlerConfig()
+ print("\n=== Test 3: Local file handling ===")
+ # Create a tmp file with test content
+ from tempfile import NamedTemporaryFile
+ with NamedTemporaryFile(delete=False) as f:
+ f.write(b"Test content")
+ f.close()
+ result = await crawler.crawl(f"file://{f.name}")
+ print(f"File content: {result.html}")
+
+ # Test 4: Raw content
+ print("\n=== Test 4: Raw content handling ===")
+ raw_html = "raw://Raw test content"
+ result = await crawler.crawl(raw_html)
+ print(f"Raw content: {result.html}")
+
+ # Test 5: Custom hooks
+ print("\n=== Test 5: Custom hooks ===")
+ async def before_request(url, kwargs):
+ print(f"Before request to {url}")
+ kwargs['headers']['X-Custom'] = 'test'
+
+ async def after_request(response):
+ print(f"After request, status: {response.status_code}")
+
+ crawler.set_hook('before_request', before_request)
+ crawler.set_hook('after_request', after_request)
+ result = await crawler.crawl("https://example.com")
+
+ # Test 6: Error handling
+ print("\n=== Test 6: Error handling ===")
+ try:
+ await crawler.crawl("https://nonexistent.domain.test")
+ except Exception as e:
+ print(f"Expected error: {e}")
+
+ # Test 7: Redirects
+ print("\n=== Test 7: Redirect handling ===")
+ crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
+ result = await crawler.crawl("http://httpbin.org/redirect/1")
+ print(f"Final URL: {result.redirected_url}")
+
+ # Test 8: Custom timeout
+ print("\n=== Test 8: Custom timeout ===")
+ try:
+ await crawler.crawl(
+ "https://httpbin.org/delay/5",
+ config=CrawlerRunConfig(page_timeout=2)
+ )
+ except ConnectionTimeoutError as e:
+ print(f"Expected timeout: {e}")
+
+ # Test 9: SSL verification
+ print("\n=== Test 9: SSL verification ===")
+ crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
+ try:
+ await crawler.crawl("https://expired.badssl.com/")
+ print("Connected to invalid SSL site with verification disabled")
+ except Exception as e:
+ print(f"SSL error: {e}")
+
+ # Test 10: Large file streaming
+ print("\n=== Test 10: Large file streaming ===")
+ from tempfile import NamedTemporaryFile
+ with NamedTemporaryFile(delete=False) as f:
+ f.write(b"" + b"X" * 1024 * 1024 * 10 + b"")
+ f.close()
+ result = await crawler.crawl("file://" + f.name)
+ print(f"Large file content length: {len(result.html)}")
+ os.remove(f.name)
+
+    await crawler.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/20241401/test_llm_filter.py b/tests/general/test_llm_filter.py
similarity index 93%
rename from tests/20241401/test_llm_filter.py
rename to tests/general/test_llm_filter.py
index 60b8549d..6211c429 100644
--- a/tests/20241401/test_llm_filter.py
+++ b/tests/general/test_llm_filter.py
@@ -1,6 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
@@ -22,8 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
@@ -43,8 +43,7 @@ async def test_llm_filter():
)
filter = LLMContentFilter(
- provider="openai/gpt-4o",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
diff --git a/tests/general/test_mhtml.py b/tests/general/test_mhtml.py
new file mode 100644
index 00000000..06e0e294
--- /dev/null
+++ b/tests/general/test_mhtml.py
@@ -0,0 +1,213 @@
+# test_mhtml_capture.py
+
+import pytest
+import asyncio
+import re # For more robust MHTML checks
+
+# Assuming these can be imported directly from the crawl4ai library
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult
+
+# A reliable, simple static HTML page for testing
+# Using httpbin as it's designed for testing clients
+TEST_URL_SIMPLE = "https://httpbin.org/html"
+EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"
+
+# A slightly more complex page that might involve JS (good secondary test)
+TEST_URL_JS = "https://quotes.toscrape.com/js/"
+EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML
+
+# Removed the custom event_loop fixture as pytest-asyncio provides a default one.
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_enabled():
+ """
+ Verify that when CrawlerRunConfig has capture_mhtml=True,
+ the CrawlResult contains valid MHTML content.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True) # Use headless for testing CI/CD
+ # --- Key: Enable MHTML capture in the run config ---
+ run_config = CrawlerRunConfig(capture_mhtml=True)
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+
+ # Perform the crawl with the MHTML-enabled config
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ # --- Assertions ---
+ assert result is not None, "Crawler should return a result object"
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated)
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check if mhtml is populated
+ assert result.mhtml is not None, "MHTML content should be captured when enabled"
+ assert isinstance(result.mhtml, str), "MHTML content should be a string"
+ assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check
+
+ # 3. Check for MHTML structure indicators (more robust than simple string contains)
+ # MHTML files are multipart MIME messages
+ assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
+ "MHTML should contain 'Content-Type: multipart/related;'"
+ # Should contain a boundary definition
+ assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
+ "MHTML should contain a multipart boundary"
+ # Should contain the main HTML part
+ assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
+ "MHTML should contain a 'Content-Type: text/html' part"
+
+ # 4. Check if the *actual page content* is within the MHTML string
+ # This confirms the snapshot captured the rendered page
+ assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
+ f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"
+
+ # 5. Ensure standard HTML is still present and correct
+ assert result.html is not None, "Standard HTML should still be present"
+ assert isinstance(result.html, str), "Standard HTML should be a string"
+ assert EXPECTED_CONTENT_SIMPLE in result.html, \
+ f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_disabled_explicitly():
+ """
+ Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
+ the CrawlResult.mhtml attribute is None.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ # --- Key: Explicitly disable MHTML capture ---
+ run_config = CrawlerRunConfig(capture_mhtml=False)
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check attribute existence (important for TDD start)
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check mhtml is None
+ assert result.mhtml is None, "MHTML content should be None when explicitly disabled"
+
+ # 3. Ensure standard HTML is still present
+ assert result.html is not None
+ assert EXPECTED_CONTENT_SIMPLE in result.html
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_disabled_by_default():
+ """
+ Verify that if capture_mhtml is not specified (using its default),
+ the CrawlResult.mhtml attribute is None.
+ (This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ # --- Key: Use default run config ---
+ run_config = CrawlerRunConfig() # Do not specify capture_mhtml
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check attribute existence
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check mhtml is None (assuming default is False)
+ assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"
+
+ # 3. Ensure standard HTML is still present
+ assert result.html is not None
+ assert EXPECTED_CONTENT_SIMPLE in result.html
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+# Optional: Add a test for a JS-heavy page if needed
+@pytest.mark.asyncio
+async def test_mhtml_capture_on_js_page_when_enabled():
+ """
+ Verify MHTML capture works on a page requiring JavaScript execution.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ run_config = CrawlerRunConfig(
+ capture_mhtml=True,
+ # Add a small wait or JS execution if needed for the JS page to fully render
+ # For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer
+ # wait_for_timeout=2000 # Example: wait up to 2 seconds
+ js_code="await new Promise(r => setTimeout(r, 500));" # Small delay after potential load
+ )
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+ assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
+ assert isinstance(result.mhtml, str), "MHTML content should be a string"
+ assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"
+
+ # Check for MHTML structure
+ assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
+ assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)
+
+ # Check for content rendered by JS within the MHTML
+ assert EXPECTED_CONTENT_JS in result.mhtml, \
+ f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"
+
+ # Check standard HTML too
+ assert result.html is not None
+ assert EXPECTED_CONTENT_JS in result.html, \
+ f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+if __name__ == "__main__":
+ # Use pytest for async tests
+ pytest.main(["-xvs", __file__])
diff --git a/tests/general/test_network_console_capture.py b/tests/general/test_network_console_capture.py
new file mode 100644
index 00000000..da41ecec
--- /dev/null
+++ b/tests/general/test_network_console_capture.py
@@ -0,0 +1,185 @@
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
+import asyncio
+import aiohttp
+from aiohttp import web
+import tempfile
+import shutil
+import os, sys, time, json
+
+
+async def start_test_server():
+ app = web.Application()
+
+ async def basic_page(request):
+ return web.Response(text="""
+
+
+
+
Network Request Test
+
+
+
Test Page for Network Capture
+
This page performs network requests and console logging.
+
+
+
+
+ """, content_type="text/html")
+
+ async def image(request):
+ # Return a small 1x1 transparent PNG
+ return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png")
+
+ async def api_data(request):
+ return web.Response(text="sample data")
+
+ async def api_json(request):
+ return web.json_response({"status": "success", "message": "JSON data"})
+
+ # Register routes
+ app.router.add_get('/', basic_page)
+ app.router.add_get('/image.png', image)
+ app.router.add_get('/api/data', api_data)
+ app.router.add_get('/api/json', api_json)
+
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, 'localhost', 8080)
+ await site.start()
+
+ return runner
+
+
+async def test_network_console_capture():
+ print("\n=== Testing Network and Console Capture ===\n")
+
+ # Start test server
+ runner = await start_test_server()
+ try:
+ browser_config = BrowserConfig(headless=True)
+
+ # Test with capture disabled (default)
+ print("\n1. Testing with capture disabled (default)...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is None, "Network requests should be None when capture is disabled"
+ assert result.console_messages is None, "Console messages should be None when capture is disabled"
+ print("✓ Default config correctly returns None for network_requests and console_messages")
+
+ # Test with network capture enabled
+ print("\n2. Testing with network capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_network_requests=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is not None, "Network requests should be captured"
+ print(f"✓ Captured {len(result.network_requests)} network requests")
+
+ # Check if we have both requests and responses
+ request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
+ response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
+ print(f" - {request_count} requests, {response_count} responses")
+
+ # Check if we captured specific resources
+ urls = [r.get("url") for r in result.network_requests]
+ has_image = any("/image.png" in url for url in urls)
+ has_api_data = any("/api/data" in url for url in urls)
+ has_api_json = any("/api/json" in url for url in urls)
+
+ assert has_image, "Should have captured image request"
+ assert has_api_data, "Should have captured API data request"
+ assert has_api_json, "Should have captured API JSON request"
+ print("✓ Captured expected network requests (image, API endpoints)")
+
+ # Test with console capture enabled
+ print("\n3. Testing with console capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_console_messages=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.console_messages is not None, "Console messages should be captured"
+ print(f"✓ Captured {len(result.console_messages)} console messages")
+
+ # Check if we have different types of console messages
+ message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
+ print(f" - Message types: {', '.join(message_types)}")
+
+ # Print all captured messages for debugging
+ print(" - Captured messages:")
+ for msg in result.console_messages:
+ print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")
+
+ # Look for specific messages
+ messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
+ has_basic_log = any("Basic console log" in msg for msg in messages)
+ has_error_msg = any("Error message" in msg for msg in messages)
+ has_warning_msg = any("Warning message" in msg for msg in messages)
+
+ assert has_basic_log, "Should have captured basic console.log message"
+ assert has_error_msg, "Should have captured console.error message"
+ assert has_warning_msg, "Should have captured console.warn message"
+ print("✓ Captured expected console messages (log, error, warning)")
+
+ # Test with both captures enabled
+ print("\n4. Testing with both network and console capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_network_requests=True,
+ capture_console_messages=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is not None, "Network requests should be captured"
+ assert result.console_messages is not None, "Console messages should be captured"
+ print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
+
+ finally:
+ await runner.cleanup()
+ print("\nTest server shutdown")
+
+
+async def main():
+ try:
+ await test_network_console_capture()
+ print("\n✅ All tests passed successfully!")
+ except Exception as e:
+ print(f"\n❌ Test failed: {str(e)}")
+ raise
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/20241401/test_robot_parser.py b/tests/general/test_robot_parser.py
similarity index 100%
rename from tests/20241401/test_robot_parser.py
rename to tests/general/test_robot_parser.py
diff --git a/tests/20241401/test_schema_builder.py b/tests/general/test_schema_builder.py
similarity index 98%
rename from tests/20241401/test_schema_builder.py
rename to tests/general/test_schema_builder.py
index 431fb001..46d0e240 100644
--- a/tests/20241401/test_schema_builder.py
+++ b/tests/general/test_schema_builder.py
@@ -10,6 +10,7 @@ import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai.utils import preprocess_html_for_schema
import json
# Test HTML - A complex job board with companies, departments, and positions
diff --git a/tests/20241401/test_stream.py b/tests/general/test_stream.py
similarity index 100%
rename from tests/20241401/test_stream.py
rename to tests/general/test_stream.py
diff --git a/tests/20241401/test_stream_dispatch.py b/tests/general/test_stream_dispatch.py
similarity index 100%
rename from tests/20241401/test_stream_dispatch.py
rename to tests/general/test_stream_dispatch.py
diff --git a/tests/20241401/tets_robot.py b/tests/general/tets_robot.py
similarity index 100%
rename from tests/20241401/tets_robot.py
rename to tests/general/tets_robot.py
diff --git a/tests/hub/test_simple.py b/tests/hub/test_simple.py
new file mode 100644
index 00000000..a970d683
--- /dev/null
+++ b/tests/hub/test_simple.py
@@ -0,0 +1,34 @@
+# test.py
+from crawl4ai import CrawlerHub
+import json
+
+async def amazon_example():
+ if crawler_cls := CrawlerHub.get("amazon_product"):
+ crawler = crawler_cls()
+ print(f"Crawler version: {crawler_cls.meta['version']}")
+ print(f"Rate limits: {crawler_cls.meta.get('rate_limit', 'Unlimited')}")
+ print(await crawler.run("https://amazon.com/test"))
+ else:
+ print("Crawler not found!")
+
+async def google_example():
+ # Get crawler dynamically
+ crawler_cls = CrawlerHub.get("google_search")
+ crawler = crawler_cls()
+
+ # Text search
+ text_results = await crawler.run(
+ query="apple inc",
+ search_type="text",
+ schema_cache_path="/Users/unclecode/.crawl4ai"
+ )
+ print(json.dumps(json.loads(text_results), indent=4))
+
+ # Image search
+ # image_results = await crawler.run(query="apple inc", search_type="image")
+ # print(image_results)
+
+if __name__ == "__main__":
+ import asyncio
+ # asyncio.run(amazon_example())
+ asyncio.run(google_example())
\ No newline at end of file
diff --git a/tests/loggers/test_logger.py b/tests/loggers/test_logger.py
new file mode 100644
index 00000000..6c3a811b
--- /dev/null
+++ b/tests/loggers/test_logger.py
@@ -0,0 +1,80 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
+import os
+from datetime import datetime
+
+class AsyncFileLogger(AsyncLoggerBase):
+ """
+ File-only asynchronous logger that writes logs to a specified file.
+ """
+
+ def __init__(self, log_file: str):
+ """
+ Initialize the file logger.
+
+ Args:
+ log_file: File path for logging
+ """
+ self.log_file = log_file
+ os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+ def _write_to_file(self, level: str, message: str, tag: str):
+ """Write a message to the log file."""
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+ with open(self.log_file, "a", encoding="utf-8") as f:
+ f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+ def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+ """Log a debug message to file."""
+ self._write_to_file("DEBUG", message, tag)
+
+ def info(self, message: str, tag: str = "INFO", **kwargs):
+ """Log an info message to file."""
+ self._write_to_file("INFO", message, tag)
+
+ def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+ """Log a success message to file."""
+ self._write_to_file("SUCCESS", message, tag)
+
+ def warning(self, message: str, tag: str = "WARNING", **kwargs):
+ """Log a warning message to file."""
+ self._write_to_file("WARNING", message, tag)
+
+ def error(self, message: str, tag: str = "ERROR", **kwargs):
+ """Log an error message to file."""
+ self._write_to_file("ERROR", message, tag)
+
+ def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+ """Log URL fetch status to file."""
+ status = "SUCCESS" if success else "FAILED"
+ message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+ self._write_to_file("URL_STATUS", message, tag)
+
+ def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+ """Log error status to file."""
+ message = f"{url[:url_length]}... | Error: {error}"
+ self._write_to_file("ERROR", message, tag)
+
+async def main():
+ browser_config = BrowserConfig(headless=True, verbose=True)
+ crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
+ await crawler.start()
+
+ try:
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ )
+ # Use the crawler multiple times
+ result = await crawler.arun(
+ url='https://kidocode.com/',
+ config=crawl_config
+ )
+ if result.success:
+ print("First crawl - Raw Markdown Length:", len(result.markdown.raw_markdown))
+
+ finally:
+ # Always ensure we close the crawler
+ await crawler.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/mcp/test_mcp_socket.py b/tests/mcp/test_mcp_socket.py
new file mode 100644
index 00000000..32456b31
--- /dev/null
+++ b/tests/mcp/test_mcp_socket.py
@@ -0,0 +1,119 @@
+# pip install "mcp-sdk[ws]" anyio
+import anyio, json
+from mcp.client.websocket import websocket_client
+from mcp.client.session import ClientSession
+
+async def test_list():
+ async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
+ async with ClientSession(r, w) as s:
+ await s.initialize()
+
+ print("tools :", [t.name for t in (await s.list_tools()).tools])
+ print("resources :", [r.name for r in (await s.list_resources()).resources])
+ print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates])
+
+
+async def test_crawl(s: ClientSession) -> None:
+ """Hit the @mcp_tool('crawl') endpoint."""
+ res = await s.call_tool(
+ "crawl",
+ {
+ "urls": ["https://example.com"],
+ "browser_config": {},
+ "crawler_config": {},
+ },
+ )
+ print("crawl →", json.loads(res.content[0].text))
+
+
+async def test_md(s: ClientSession) -> None:
+ """Hit the @mcp_tool('md') endpoint."""
+ res = await s.call_tool(
+ "md",
+ {
+ "url": "https://example.com",
+ "f": "fit", # or RAW, BM25, LLM
+ "q": None,
+ "c": "0",
+ },
+ )
+ result = json.loads(res.content[0].text)
+ print("md →", result['markdown'][:100], "...")
+
+async def test_screenshot(s: ClientSession):
+ res = await s.call_tool(
+ "screenshot",
+ {
+ "url": "https://example.com",
+ "screenshot_wait_for": 1.0,
+ },
+ )
+ png_b64 = json.loads(res.content[0].text)["screenshot"]
+ print("screenshot →", png_b64[:60], "… (base64)")
+
+
+async def test_pdf(s: ClientSession):
+ res = await s.call_tool(
+ "pdf",
+ {
+ "url": "https://example.com",
+ },
+ )
+ pdf_b64 = json.loads(res.content[0].text)["pdf"]
+ print("pdf →", pdf_b64[:60], "… (base64)")
+
+async def test_execute_js(s: ClientSession):
+ # click the “More” link on Hacker News front page and wait 1 s
+ res = await s.call_tool(
+ "execute_js",
+ {
+ "url": "https://news.ycombinator.com/news",
+ "js_code": [
+ "await page.click('a.morelink')",
+ "await page.waitForTimeout(1000)",
+ ],
+ },
+ )
+ crawl_result = json.loads(res.content[0].text)
+ print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+
+async def test_html(s: ClientSession):
+ # fetch the page HTML of the Hacker News front page via the 'html' tool
+ res = await s.call_tool(
+ "html",
+ {
+ "url": "https://news.ycombinator.com/news",
+ },
+ )
+ crawl_result = json.loads(res.content[0].text)
+ print("html → status", crawl_result.get("success"), "| html len:", len(crawl_result["html"]))
+
+async def test_context(s: ClientSession):
+ # ask the 'ask' tool a question about the Crawl4AI library itself
+ res = await s.call_tool(
+ "ask",
+ {
+ "query": "I have a question about the Crawl4ai library: how do I extract internal links when crawling a page?"
+ },
+ )
+ answer = json.loads(res.content[0].text)
+ print("ask →", str(answer)[:200], "…")
+
+
+async def main() -> None:
+ async with websocket_client("ws://localhost:11235/mcp/ws") as (r, w):
+ async with ClientSession(r, w) as s:
+ await s.initialize() # handshake
+ tools = (await s.list_tools()).tools
+ print("tools:", [t.name for t in tools])
+
+ # await test_list()
+ await test_crawl(s)
+ await test_md(s)
+ await test_screenshot(s)
+ await test_pdf(s)
+ await test_execute_js(s)
+ await test_html(s)
+ await test_context(s)
+
+anyio.run(main)
diff --git a/tests/mcp/test_mcp_sse.py b/tests/mcp/test_mcp_sse.py
new file mode 100644
index 00000000..d9eee557
--- /dev/null
+++ b/tests/mcp/test_mcp_sse.py
@@ -0,0 +1,11 @@
+from mcp.client.sse import sse_client
+from mcp.client.session import ClientSession
+
+async def main():
+ async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
+ async with ClientSession(r, w) as sess:
+ print(await sess.list_tools()) # now works
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
diff --git a/tests/memory/README.md b/tests/memory/README.md
new file mode 100644
index 00000000..164ef095
--- /dev/null
+++ b/tests/memory/README.md
@@ -0,0 +1,315 @@
+# Crawl4AI Stress Testing and Benchmarking
+
+This directory contains tools for stress testing Crawl4AI's `arun_many` method and dispatcher system with high volumes of URLs to evaluate performance, concurrency handling, and potentially detect memory issues. It also includes a benchmarking system to track performance over time.
+
+## Quick Start
+
+```bash
+# Run a default stress test (small config) and generate a report
+# (Assumes run_all.sh is updated to call run_benchmark.py)
+./run_all.sh
+```
+*Note: `run_all.sh` might need to be updated if it directly called the old script.*
+
+## Overview
+
+The stress testing system works by:
+
+1. Generating a local test site with heavy HTML pages (regenerated by default for each test).
+2. Starting a local HTTP server to serve these pages.
+3. Running Crawl4AI's `arun_many` method against this local site using the `MemoryAdaptiveDispatcher` with configurable concurrency (`max_sessions`); a minimal sketch of this call appears just after this list.
+4. Monitoring performance metrics via the `CrawlerMonitor` and optionally logging memory usage.
+5. Optionally generating detailed benchmark reports with visualizations using `benchmark_report.py`.
+
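+The core of step 3 boils down to something like the sketch below. This is a simplified illustration rather than the exact code in `test_stress_sdk.py`; the dispatcher parameters (`memory_threshold_percent`, `max_session_permit`) follow the public Crawl4AI dispatcher API and may differ slightly between versions, and the localhost URL pattern is only a placeholder for the generated test site.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
+
+async def stress_crawl(urls, max_sessions=16):
+    # Dispatcher that caps concurrency and backs off under memory pressure
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=90.0,
+        max_session_permit=max_sessions,
+    )
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=False)
+
+    async with AsyncWebCrawler() as crawler:
+        # arun_many fans the URL list out through the dispatcher
+        results = await crawler.arun_many(urls=urls, config=run_config, dispatcher=dispatcher)
+    return results
+
+if __name__ == "__main__":
+    urls = [f"http://localhost:8000/page_{i}.html" for i in range(50)]  # placeholder URL pattern
+    asyncio.run(stress_crawl(urls))
+```
+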
+## Available Tools
+
+- `test_stress_sdk.py` - Main stress testing script utilizing `arun_many` and dispatchers.
+- `benchmark_report.py` - Report generator for comparing test results (assumes compatibility with `test_stress_sdk.py` outputs).
+- `run_benchmark.py` - Python script with predefined test configurations that orchestrates tests using `test_stress_sdk.py`.
+- `run_all.sh` - Simple wrapper script (may need updating).
+
+## Usage Guide
+
+### Using Predefined Configurations (Recommended)
+
+The `run_benchmark.py` script offers the easiest way to run standardized tests:
+
+```bash
+# Quick test (50 URLs, 4 max sessions)
+python run_benchmark.py quick
+
+# Medium test (500 URLs, 16 max sessions)
+python run_benchmark.py medium
+
+# Large test (1000 URLs, 32 max sessions)
+python run_benchmark.py large
+
+# Extreme test (2000 URLs, 64 max sessions)
+python run_benchmark.py extreme
+
+# Custom configuration
+python run_benchmark.py custom --urls 300 --max-sessions 24 --chunk-size 50
+
+# Run 'small' test in streaming mode
+python run_benchmark.py small --stream
+
+# Override max_sessions for the 'medium' config
+python run_benchmark.py medium --max-sessions 20
+
+# Skip benchmark report generation after the test
+python run_benchmark.py small --no-report
+
+# Clean up reports and site files before running
+python run_benchmark.py medium --clean
+```
+
+#### `run_benchmark.py` Parameters
+
+| Parameter | Default | Description |
+| -------------------- | --------------- | --------------------------------------------------------------------------- |
+| `config` | *required* | Test configuration: `quick`, `small`, `medium`, `large`, `extreme`, `custom`|
+| `--urls` | config-specific | Number of URLs (required for `custom`) |
+| `--max-sessions` | config-specific | Max concurrent sessions managed by dispatcher (required for `custom`) |
+| `--chunk-size` | config-specific | URLs per batch for non-stream logging (required for `custom`) |
+| `--stream` | False | Enable streaming results (disables batch logging) |
+| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live monitor |
+| `--use-rate-limiter` | False | Enable basic rate limiter in the dispatcher |
+| `--port` | 8000 | HTTP server port |
+| `--no-report` | False | Skip generating comparison report via `benchmark_report.py` |
+| `--clean` | False | Clean up reports and site files before running |
+| `--keep-server-alive`| False | Keep local HTTP server running after test |
+| `--use-existing-site`| False | Use existing site on specified port (no local server start/site gen) |
+| `--skip-generation` | False | Use existing site files but start local server |
+| `--keep-site` | False | Keep generated site files after test |
+
+#### Predefined Configurations
+
+| Configuration | URLs | Max Sessions | Chunk Size | Description |
+| ------------- | ------ | ------------ | ---------- | -------------------------------- |
+| `quick` | 50 | 4 | 10 | Quick test for basic validation |
+| `small` | 100 | 8 | 20 | Small test for routine checks |
+| `medium` | 500 | 16 | 50 | Medium test for thorough checks |
+| `large` | 1000 | 32 | 100 | Large test for stress testing |
+| `extreme` | 2000 | 64 | 200 | Extreme test for limit testing |
+
+### Direct Usage of `test_stress_sdk.py`
+
+For fine-grained control or debugging, you can run the stress test script directly:
+
+```bash
+# Test with 200 URLs and 32 max concurrent sessions
+python test_stress_sdk.py --urls 200 --max-sessions 32 --chunk-size 40
+
+# Clean up previous test data first
+python test_stress_sdk.py --clean-reports --clean-site --urls 100 --max-sessions 16 --chunk-size 20
+
+# Change the HTTP server port and use aggregated monitor
+python test_stress_sdk.py --port 8088 --urls 100 --max-sessions 16 --monitor-mode AGGREGATED
+
+# Enable streaming mode and use rate limiting
+python test_stress_sdk.py --urls 50 --max-sessions 8 --stream --use-rate-limiter
+
+# Change report output location
+python test_stress_sdk.py --report-path custom_reports --urls 100 --max-sessions 16
+```
+
+#### `test_stress_sdk.py` Parameters
+
+| Parameter | Default | Description |
+| -------------------- | ---------- | -------------------------------------------------------------------- |
+| `--urls` | 100 | Number of URLs to test |
+| `--max-sessions` | 16 | Maximum concurrent crawling sessions managed by the dispatcher |
+| `--chunk-size` | 10 | Number of URLs per batch (relevant for non-stream logging) |
+| `--stream` | False | Enable streaming results (disables batch logging) |
+| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live `CrawlerMonitor` |
+| `--use-rate-limiter` | False | Enable a basic `RateLimiter` within the dispatcher |
+| `--site-path` | "test_site"| Path to store/use the generated test site |
+| `--port` | 8000 | Port for the local HTTP server |
+| `--report-path` | "reports" | Path to save test result summary (JSON) and memory samples (CSV) |
+| `--skip-generation` | False | Use existing test site files but still start local server |
+| `--use-existing-site`| False | Use existing site on specified port (no local server/site gen) |
+| `--keep-server-alive`| False | Keep local HTTP server running after test completion |
+| `--keep-site` | False | Keep the generated test site files after test completion |
+| `--clean-reports` | False | Clean up report directory before running |
+| `--clean-site` | False | Clean up site directory before/after running (see script logic) |
+
+### Generating Reports Only
+
+If you only want to generate a benchmark report from existing test results (assuming `benchmark_report.py` is compatible):
+
+```bash
+# Generate a report from existing test results in ./reports/
+python benchmark_report.py
+
+# Limit to the most recent 5 test results
+python benchmark_report.py --limit 5
+
+# Specify a custom source directory for test results
+python benchmark_report.py --reports-dir alternate_results
+```
+
+#### `benchmark_report.py` Parameters (Assumed)
+
+| Parameter | Default | Description |
+| --------------- | -------------------- | ----------------------------------------------------------- |
+| `--reports-dir` | "reports" | Directory containing `test_stress_sdk.py` result files |
+| `--output-dir` | "benchmark_reports" | Directory to save generated HTML reports and charts |
+| `--limit` | None (all results) | Limit comparison to N most recent test results |
+| `--output-file` | Auto-generated | Custom output filename for the HTML report |
+
+## Understanding the Test Output
+
+### Real-time Progress Display (`CrawlerMonitor`)
+
+When running `test_stress_sdk.py`, the `CrawlerMonitor` provides a live view of the crawling process managed by the dispatcher.
+
+- **DETAILED Mode (Default):** Shows individual task status (Queued, Active, Completed, Failed), timings, memory usage per task (if `psutil` is available), overall queue statistics, and memory pressure status (if `psutil` available).
+- **AGGREGATED Mode:** Shows summary counts (Queued, Active, Completed, Failed), overall progress percentage, estimated time remaining, average URLs/sec, and memory pressure status.
+
+### Batch Log Output (Non-Streaming Mode Only)
+
+If running `test_stress_sdk.py` **without** the `--stream` flag, you will *also* see per-batch summary lines printed to the console *after* the monitor display, once each chunk of URLs finishes processing:
+
+```
+ Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status
+───────────────────────────────────────────────────────────────────────────────────────────
+ 1 | 10.0% | 50.1 MB | 55.3 MB | 23.8 | 10/0 | 0.42 | Success
+ 2 | 20.0% | 55.3 MB | 60.1 MB | 24.1 | 10/0 | 0.41 | Success
+ ...
+```
+
+This display provides chunk-specific metrics:
+- **Batch**: The batch number being reported.
+- **Progress**: Overall percentage of total URLs processed *after* this batch.
+- **Start Mem / End Mem**: Memory usage before and after processing this batch (if tracked).
+- **URLs/sec**: Processing speed *for this specific batch*.
+- **Success/Fail**: Number of successful and failed URLs *in this batch*.
+- **Time (s)**: Wall-clock time taken to process *this batch*.
+- **Status**: Color-coded status for the batch outcome.
+
+### Summary Output
+
+After test completion, a final summary is displayed:
+
+```
+================================================================================
+Test Completed
+================================================================================
+Test ID: 20250418_103015
+Configuration: 100 URLs, 16 max sessions, Chunk: 10, Stream: False, Monitor: DETAILED
+Results: 100 successful, 0 failed (100 processed, 100.0% success)
+Performance: 5.85 seconds total, 17.09 URLs/second avg
+Memory Usage: Start: 50.1 MB, End: 75.3 MB, Max: 78.1 MB, Growth: 25.2 MB
+Results summary saved to reports/test_summary_20250418_103015.json
+```
+
+### HTML Report Structure (Generated by `benchmark_report.py`)
+
+(This section remains the same, assuming `benchmark_report.py` generates these)
+The benchmark report contains several sections:
+1. **Summary**: Overview of the latest test results and trends
+2. **Performance Comparison**: Charts showing throughput across tests
+3. **Memory Usage**: Detailed memory usage graphs for each test
+4. **Detailed Results**: Tabular data of all test metrics
+5. **Conclusion**: Automated analysis of performance and memory patterns
+
+### Memory Metrics
+
+(This section remains conceptually the same)
+Memory growth is the key metric for detecting leaks...
+
+### Performance Metrics
+
+(This section remains conceptually the same, though "URLs per Worker" is less relevant - focus on overall URLs/sec)
+Key performance indicators include:
+- **URLs per Second**: Higher is better (throughput)
+- **Success Rate**: Should be 100% in normal conditions
+- **Total Processing Time**: Lower is better
+- **Dispatcher Efficiency**: Observe queue lengths and wait times in the monitor (Detailed mode)
+
+### Raw Data Files
+
+Raw data is saved in the `--report-path` directory (default `./reports/`):
+
+- **JSON files** (`test_summary_*.json`): Contains the final summary for each test run.
+- **CSV files** (`memory_samples_*.csv`): Contains time-series memory samples taken during the test run.
+
+Example of reading raw data:
+```python
+import json
+import pandas as pd
+
+# Load test summary
+test_id = "20250418_103015" # Example ID
+with open(f'reports/test_summary_{test_id}.json', 'r') as f:
+ results = json.load(f)
+
+# Load memory samples
+memory_df = pd.read_csv(f'reports/memory_samples_{test_id}.csv')
+
+# Analyze memory_df (e.g., calculate growth, plot)
+if not memory_df['memory_info_mb'].isnull().all():
+ growth = memory_df['memory_info_mb'].iloc[-1] - memory_df['memory_info_mb'].iloc[0]
+ print(f"Total Memory Growth: {growth:.1f} MB")
+else:
+ print("No valid memory samples found.")
+
+print(f"Avg URLs/sec: {results['urls_processed'] / results['total_time_seconds']:.2f}")
+```
+
+## Visualization Dependencies
+
+For full visualization capabilities in the HTML reports generated by `benchmark_report.py`, install the additional dependencies listed in `requirements.txt` (pandas, matplotlib, seaborn, rich); without them, only text-based reports are generated.
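+
+A minimal install, assuming you run it from the `tests/memory/` directory where `requirements.txt` lives:
+
+```bash
+pip install -r requirements.txt
+# or install the packages individually
+pip install pandas matplotlib seaborn rich
+```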
+
+## Directory Structure
+
+```
+tests/memory/              # Directory containing the stress-test tooling
+├── benchmark_reports/     # Generated HTML reports (by benchmark_report.py)
+├── reports/               # Raw test result data (from test_stress_sdk.py)
+├── test_site/             # Generated test content (temporary)
+├── benchmark_report.py    # Report generator
+├── run_benchmark.py       # Test runner with predefined configs
+├── test_stress_sdk.py     # Main stress test implementation using arun_many
+├── requirements.txt       # Visualization dependencies for benchmark_report.py
+└── run_all.sh             # Simple wrapper script (may need updates)
+```
+
+## Cleanup
+
+To clean up after testing:
+
+```bash
+# Remove the test site content (if not using --keep-site)
+rm -rf test_site
+
+# Remove all raw reports and generated benchmark reports
+rm -rf reports benchmark_reports
+
+# Or use the --clean flag with run_benchmark.py
+python run_benchmark.py medium --clean
+```
+
+## Use in CI/CD
+
+(This section remains conceptually the same, just update script names)
+These tests can be integrated into CI/CD pipelines:
+```bash
+# Example CI script
+python run_benchmark.py medium --no-report # Run test without interactive report gen
+# Check exit code
+if [ $? -ne 0 ]; then echo "Stress test failed!"; exit 1; fi
+# Optionally, run report generator and check its output/metrics
+# python benchmark_report.py
+# check_report_metrics.py reports/test_summary_*.json || exit 1
+exit 0
+```
+
+## Troubleshooting
+
+- **HTTP Server Port Conflict**: Use `--port` with `run_benchmark.py` or `test_stress_sdk.py`.
+- **Memory Tracking Issues**: The `SimpleMemoryTracker` uses platform commands (`ps`, `/proc`, `tasklist`). Ensure these are available and the script has permission. If it consistently fails, memory reporting will be limited.
+- **Visualization Missing**: Related to `benchmark_report.py` and its dependencies.
+- **Site Generation Issues**: Check permissions for creating `./test_site/`. Use `--skip-generation` if you want to manage the site manually.
+- **Testing Against External Site**: Ensure the external site is running and use `--use-existing-site --port <port>`.
diff --git a/tests/memory/benchmark_report.py b/tests/memory/benchmark_report.py
new file mode 100755
index 00000000..a634f997
--- /dev/null
+++ b/tests/memory/benchmark_report.py
@@ -0,0 +1,887 @@
+#!/usr/bin/env python3
+"""
+Benchmark reporting tool for Crawl4AI stress tests.
+Generates visual reports and comparisons between test runs.
+"""
+
+import os
+import json
+import glob
+import argparse
+import sys
+from datetime import datetime
+from pathlib import Path
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+
+# Initialize rich console
+console = Console()
+
+# Try to import optional visualization dependencies
+VISUALIZATION_AVAILABLE = True
+try:
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import numpy as np
+ import seaborn as sns
+except ImportError:
+ VISUALIZATION_AVAILABLE = False
+ console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
+ console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
+ console.print("[yellow]Only text-based reports will be generated.[/yellow]")
+
+# Configure plotting if available
+if VISUALIZATION_AVAILABLE:
+ # Set plot style for dark theme
+ plt.style.use('dark_background')
+ sns.set_theme(style="darkgrid")
+
+ # Custom color palette based on Nord theme
+ nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
+ sns.set_palette(nord_palette)
+
+class BenchmarkReporter:
+ """Generates visual reports and comparisons for Crawl4AI stress tests."""
+
+ def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
+ """Initialize the benchmark reporter.
+
+ Args:
+ reports_dir: Directory containing test result files
+ output_dir: Directory to save generated reports
+ """
+ self.reports_dir = Path(reports_dir)
+ self.output_dir = Path(output_dir)
+ self.output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Configure matplotlib if available
+ if VISUALIZATION_AVAILABLE:
+ # Ensure the matplotlib backend works in headless environments
+ mpl.use('Agg')
+
+ # Set up styling for plots with dark theme
+ mpl.rcParams['figure.figsize'] = (12, 8)
+ mpl.rcParams['font.size'] = 12
+ mpl.rcParams['axes.labelsize'] = 14
+ mpl.rcParams['axes.titlesize'] = 16
+ mpl.rcParams['xtick.labelsize'] = 12
+ mpl.rcParams['ytick.labelsize'] = 12
+ mpl.rcParams['legend.fontsize'] = 12
+ mpl.rcParams['figure.facecolor'] = '#1e1e1e'
+ mpl.rcParams['axes.facecolor'] = '#2e3440'
+ mpl.rcParams['savefig.facecolor'] = '#1e1e1e'
+ mpl.rcParams['text.color'] = '#e0e0e0'
+ mpl.rcParams['axes.labelcolor'] = '#e0e0e0'
+ mpl.rcParams['xtick.color'] = '#e0e0e0'
+ mpl.rcParams['ytick.color'] = '#e0e0e0'
+ mpl.rcParams['grid.color'] = '#444444'
+ mpl.rcParams['figure.edgecolor'] = '#444444'
+
+ def load_test_results(self, limit=None):
+ """Load all test results from the reports directory.
+
+ Args:
+ limit: Optional limit on number of most recent tests to load
+
+ Returns:
+ Dictionary mapping test IDs to result data
+ """
+ result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
+
+ # Sort files by modification time (newest first)
+ result_files.sort(key=os.path.getmtime, reverse=True)
+
+ if limit:
+ result_files = result_files[:limit]
+
+ results = {}
+ for file_path in result_files:
+ try:
+ with open(file_path, 'r') as f:
+ data = json.load(f)
+ test_id = data.get('test_id')
+ if test_id:
+ results[test_id] = data
+
+ # Try to load the corresponding memory samples
+ csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
+ if VISUALIZATION_AVAILABLE and csv_path.exists(): # pandas is needed to read the CSV
+ try:
+ memory_df = pd.read_csv(csv_path)
+ results[test_id]['memory_samples'] = memory_df
+ except Exception as e:
+ console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
+ except Exception as e:
+ console.print(f"[red]Error loading {file_path}: {e}[/red]")
+
+ console.print(f"Loaded {len(results)} test results")
+ return results
+
+ def generate_summary_table(self, results):
+ """Generate a summary table of test results.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+
+ Returns:
+ Rich Table object
+ """
+ table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
+
+ # Define columns
+ table.add_column("Test ID", style="cyan")
+ table.add_column("Date", style="bright_green")
+ table.add_column("URLs", justify="right")
+ table.add_column("Workers", justify="right")
+ table.add_column("Success %", justify="right")
+ table.add_column("Time (s)", justify="right")
+ table.add_column("Mem Growth", justify="right")
+ table.add_column("URLs/sec", justify="right")
+
+ # Add rows
+ for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
+ # Parse timestamp from test_id
+ try:
+ date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
+ except:
+ date_str = "Unknown"
+
+ # Calculate success percentage
+ total_urls = data.get('url_count', 0)
+ successful = data.get('successful_urls', 0)
+ success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
+
+ # Calculate memory growth if available
+ mem_growth = "N/A"
+ if 'memory_samples' in data:
+ samples = data['memory_samples']
+ if len(samples) >= 2:
+ # Try to extract numeric values from memory_info strings
+ try:
+ first_mem = float(samples.iloc[0]['memory_info'].split()[0])
+ last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
+ mem_growth = f"{last_mem - first_mem:.1f} MB"
+ except:
+ pass
+
+ # Calculate URLs per second
+ time_taken = data.get('total_time_seconds', 0)
+ urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
+
+ table.add_row(
+ test_id,
+ date_str,
+ str(total_urls),
+ str(data.get('workers', 'N/A')),
+ f"{success_pct:.1f}%",
+ f"{data.get('total_time_seconds', 0):.2f}",
+ mem_growth,
+ f"{urls_per_sec:.1f}"
+ )
+
+ return table
+
+ def generate_performance_chart(self, results, output_file=None):
+ """Generate a performance comparison chart.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ output_file: File path to save the chart
+
+ Returns:
+ Path to the saved chart file or None if visualization is not available
+ """
+ if not VISUALIZATION_AVAILABLE:
+ console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
+ return None
+
+ # Extract relevant data
+ data = []
+ for test_id, result in results.items():
+ urls = result.get('url_count', 0)
+ workers = result.get('workers', 0)
+ time_taken = result.get('total_time_seconds', 0)
+ urls_per_sec = urls / time_taken if time_taken > 0 else 0
+
+ # Parse timestamp from test_id for sorting
+ try:
+ timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
+ data.append({
+ 'test_id': test_id,
+ 'timestamp': timestamp,
+ 'urls': urls,
+ 'workers': workers,
+ 'time_seconds': time_taken,
+ 'urls_per_sec': urls_per_sec
+ })
+ except:
+ console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
+
+ if not data:
+ console.print("[yellow]No valid data for performance chart[/yellow]")
+ return None
+
+ # Convert to DataFrame and sort by timestamp
+ df = pd.DataFrame(data)
+ df = df.sort_values('timestamp')
+
+ # Create the plot
+ fig, ax1 = plt.subplots(figsize=(12, 6))
+
+ # Plot URLs per second as bars with properly set x-axis
+ x_pos = range(len(df['test_id']))
+ bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
+ ax1.set_ylabel('URLs per Second', color='#88c0d0')
+ ax1.tick_params(axis='y', labelcolor='#88c0d0')
+
+ # Properly set x-axis labels
+ ax1.set_xticks(x_pos)
+ ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
+
+ # Add worker count as text on each bar
+ for i, bar in enumerate(bars):
+ height = bar.get_height()
+ workers = df.iloc[i]['workers']
+ ax1.text(i, height + 0.1,
+ f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
+
+ # Add a second y-axis for total URLs
+ ax2 = ax1.twinx()
+ ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
+ ax2.set_ylabel('Total URLs', color='#bf616a')
+ ax2.tick_params(axis='y', labelcolor='#bf616a')
+
+ # Set title and layout
+ plt.title('Crawl4AI Performance Benchmarks')
+ plt.tight_layout()
+
+ # Save the figure
+ if output_file is None:
+ output_file = self.output_dir / "performance_comparison.png"
+ plt.savefig(output_file, dpi=100, bbox_inches='tight')
+ plt.close()
+
+ return output_file
+
+ def generate_memory_charts(self, results, output_prefix=None):
+ """Generate memory usage charts for each test.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ output_prefix: Prefix for output file names
+
+ Returns:
+ List of paths to the saved chart files
+ """
+ if not VISUALIZATION_AVAILABLE:
+ console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
+ return []
+
+ output_files = []
+
+ for test_id, result in results.items():
+ if 'memory_samples' not in result:
+ continue
+
+ memory_df = result['memory_samples']
+
+ # Check if we have enough data points
+ if len(memory_df) < 2:
+ continue
+
+ # Try to extract numeric values from memory_info strings
+ try:
+ memory_values = []
+ for mem_str in memory_df['memory_info']:
+ # Extract the number from strings like "142.8 MB"
+ value = float(mem_str.split()[0])
+ memory_values.append(value)
+
+ memory_df['memory_mb'] = memory_values
+ except Exception as e:
+ console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
+ continue
+
+ # Create the plot
+ plt.figure(figsize=(10, 6))
+
+ # Plot memory usage over time
+ plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'],
+ color='#88c0d0', marker='o', linewidth=2, markersize=4)
+
+ # Add annotations for chunk processing
+ chunk_size = result.get('chunk_size', 0)
+ url_count = result.get('url_count', 0)
+ if chunk_size > 0 and url_count > 0:
+ # Estimate chunk processing times
+ num_chunks = (url_count + chunk_size - 1) // chunk_size # Ceiling division
+ total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
+ chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
+
+ for i, time_point in enumerate(chunk_times):
+ if time_point <= memory_df['elapsed_seconds'].max():
+ plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
+ plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}',
+ rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
+
+ # Set labels and title
+ plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
+ plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
+ plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)',
+ color='#e0e0e0')
+
+ # Add grid and set y-axis to start from zero
+ plt.grid(True, alpha=0.3, color='#4c566a')
+
+ # Add test metadata as text
+ info_text = (
+ f"URLs: {url_count}\n"
+ f"Workers: {result.get('workers', 'N/A')}\n"
+ f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
+ f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
+ )
+
+ # Calculate memory growth
+ if len(memory_df) >= 2:
+ first_mem = memory_df.iloc[0]['memory_mb']
+ last_mem = memory_df.iloc[-1]['memory_mb']
+ growth = last_mem - first_mem
+ growth_rate = growth / result.get('total_time_seconds', 1)
+
+ info_text += f"Memory Growth: {growth:.1f} MB\n"
+ info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
+
+ plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
+ bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
+
+ # Save the figure
+ if output_prefix is None:
+ output_file = self.output_dir / f"memory_chart_{test_id}.png"
+ else:
+ output_file = Path(f"{output_prefix}_memory_{test_id}.png")
+
+ plt.tight_layout()
+ plt.savefig(output_file, dpi=100, bbox_inches='tight')
+ plt.close()
+
+ output_files.append(output_file)
+
+ return output_files
+
+ def generate_comparison_report(self, results, title=None, output_file=None):
+ """Generate a comprehensive comparison report of multiple test runs.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ title: Optional title for the report
+ output_file: File path to save the report
+
+ Returns:
+ Path to the saved report file
+ """
+ if not results:
+ console.print("[yellow]No results to generate comparison report[/yellow]")
+ return None
+
+ if output_file is None:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ output_file = self.output_dir / f"comparison_report_{timestamp}.html"
+
+ # Create data for the report
+ rows = []
+ for test_id, data in results.items():
+ # Calculate metrics
+ urls = data.get('url_count', 0)
+ workers = data.get('workers', 0)
+ successful = data.get('successful_urls', 0)
+ failed = data.get('failed_urls', 0)
+ time_seconds = data.get('total_time_seconds', 0)
+
+ # Calculate additional metrics
+ success_rate = (successful / urls) * 100 if urls > 0 else 0
+ urls_per_second = urls / time_seconds if time_seconds > 0 else 0
+ urls_per_worker = urls / workers if workers > 0 else 0
+
+ # Calculate memory growth if available
+ mem_start = None
+ mem_end = None
+ mem_growth = None
+ if 'memory_samples' in data:
+ samples = data['memory_samples']
+ if len(samples) >= 2:
+ try:
+ first_mem = float(samples.iloc[0]['memory_info'].split()[0])
+ last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
+ mem_start = first_mem
+ mem_end = last_mem
+ mem_growth = last_mem - first_mem
+ except:
+ pass
+
+ # Parse timestamp from test_id
+ try:
+ timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
+ except:
+ timestamp = None
+
+ rows.append({
+ 'test_id': test_id,
+ 'timestamp': timestamp,
+ 'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
+ 'urls': urls,
+ 'workers': workers,
+ 'chunk_size': data.get('chunk_size', 0),
+ 'successful': successful,
+ 'failed': failed,
+ 'success_rate': success_rate,
+ 'time_seconds': time_seconds,
+ 'urls_per_second': urls_per_second,
+ 'urls_per_worker': urls_per_worker,
+ 'memory_start': mem_start,
+ 'memory_end': mem_end,
+ 'memory_growth': mem_growth
+ })
+
+ # Sort data by timestamp if possible
+ if VISUALIZATION_AVAILABLE:
+ # Convert to DataFrame and sort by timestamp
+ df = pd.DataFrame(rows)
+ if 'timestamp' in df.columns and not df['timestamp'].isna().all():
+ df = df.sort_values('timestamp', ascending=False)
+ else:
+ # Simple sorting without pandas
+ rows.sort(key=lambda x: x.get('timestamp', datetime.now()), reverse=True)
+ df = None
+
+ # Generate HTML report
+ html = []
+ html.append('<!DOCTYPE html>')
+ html.append('<html>')
+ html.append('<head>')
+ html.append(' <meta charset="utf-8">')
+ html.append(' <meta name="viewport" content="width=device-width, initial-scale=1">')
+ html.append(f'<title>{title or "Crawl4AI Benchmark Comparison"}</title>')
+ html.append('<style>body{font-family:sans-serif;margin:2em;background:#1e1e1e;color:#e0e0e0;} .status-good{color:#a3be8c;} .status-warning{color:#ebcb8b;} .status-bad{color:#bf616a;} table{border-collapse:collapse;} th,td{border:1px solid #4c566a;padding:4px 8px;}</style>')
+ html.append('</head>')
+ html.append('<body>')
+
+ # Header
+ html.append(f'<h1>{title or "Crawl4AI Benchmark Comparison"}</h1>')
+ html.append(f'<p>Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>')
+
+ # Summary section
+ html.append('<div class="section">')
+ html.append('<h2>Summary</h2>')
+ html.append('<p>This report compares the performance of Crawl4AI across multiple test runs.</p>')
+
+ # Summary metrics
+ data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0)
+ if data_available:
+ # Get the latest test data
+ if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
+ latest_test = df.iloc[0]
+ latest_id = latest_test['test_id']
+ else:
+ latest_test = rows[0] # First row (already sorted by timestamp)
+ latest_id = latest_test['test_id']
+
+ html.append('<h3>Latest Test Results</h3>')
+ html.append('<ul>')
+ html.append(f'<li>Test ID: {latest_id}</li>')
+ html.append(f'<li>Date: {latest_test["date"]}</li>')
+ html.append(f'<li>URLs: {latest_test["urls"]}</li>')
+ html.append(f'<li>Workers: {latest_test["workers"]}</li>')
+ html.append(f'<li>Success Rate: {latest_test["success_rate"]:.1f}%</li>')
+ html.append(f'<li>Time: {latest_test["time_seconds"]:.2f} seconds</li>')
+ html.append(f'<li>Performance: {latest_test["urls_per_second"]:.1f} URLs/second</li>')
+
+ # Check memory growth (handle both pandas and dict mode)
+ memory_growth_available = False
+ if VISUALIZATION_AVAILABLE and df is not None:
+ if pd.notna(latest_test["memory_growth"]):
+ html.append(f'<li>Memory Growth: {latest_test["memory_growth"]:.1f} MB</li>')
+ memory_growth_available = True
+ else:
+ if latest_test["memory_growth"] is not None:
+ html.append(f'<li>Memory Growth: {latest_test["memory_growth"]:.1f} MB</li>')
+ memory_growth_available = True
+
+ html.append('</ul>')
+
+ # If we have more than one test, show trend
+ if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1):
+ if VISUALIZATION_AVAILABLE and df is not None:
+ prev_test = df.iloc[1]
+ else:
+ prev_test = rows[1]
+
+ # Calculate performance change
+ perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0
+
+ status_class = ""
+ if perf_change > 5:
+ status_class = "status-good"
+ elif perf_change < -5:
+ status_class = "status-bad"
+
+ html.append('<h3>Performance Trend</h3>')
+ html.append('<ul>')
+ html.append(f'<li>Performance Change: <span class="{status_class}">{perf_change:+.1f}%</span> compared to previous test</li>')
+
+ # Memory trend if available
+ memory_trend_available = False
+ if VISUALIZATION_AVAILABLE and df is not None:
+ if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]):
+ mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
+ memory_trend_available = True
+ else:
+ if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None:
+ mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
+ memory_trend_available = True
+
+ if memory_trend_available:
+ mem_status = ""
+ if mem_change < -1: # Improved (less growth)
+ mem_status = "status-good"
+ elif mem_change > 1: # Worse (more growth)
+ mem_status = "status-bad"
+
+ html.append(f'<li>Memory Trend: <span class="{mem_status}">{mem_change:+.1f} MB</span> change in memory growth</li>')
+
+ html.append('</ul>')
+
+ html.append('</div>')
+
+ # Generate performance chart if visualization is available
+ if VISUALIZATION_AVAILABLE:
+ perf_chart = self.generate_performance_chart(results)
+ if perf_chart:
+ html.append('<div class="section">')
+ html.append('<h2>Performance Comparison</h2>')
+ html.append(f'<img src="{perf_chart.name}" alt="Performance comparison chart">')
+ html.append('</div>')
+ else:
+ html.append('<div class="section">')
+ html.append('<h2>Performance Comparison</h2>')
+ html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
+ html.append('</div>')
+
+ # Generate memory charts if visualization is available
+ if VISUALIZATION_AVAILABLE:
+ memory_charts = self.generate_memory_charts(results)
+ if memory_charts:
+ html.append('<div class="section">')
+ html.append('<h2>Memory Usage</h2>')
+
+ for chart in memory_charts:
+ test_id = chart.stem.split('_')[-1]
+ html.append(f'<h3>Test {test_id}</h3>')
+ html.append(f'<img src="{chart.name}" alt="Memory usage chart for test {test_id}">')
+
+ html.append('</div>')
+ else:
+ html.append('<div class="section">')
+ html.append('<h2>Memory Usage</h2>')
+ html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
+ html.append('</div>')
+
+ # Detailed results table
+ html.append('<h2>Detailed Results</h2>')
+
+ # Add the results as an HTML table
+ html.append('<table>')
+
+ # Table headers
+ html.append('<tr>')
+ for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']:
+ html.append(f'<th>{col}</th>')
+ html.append('</tr>')
+
+ # Table rows - handle both pandas DataFrame and list of dicts
+ if VISUALIZATION_AVAILABLE and df is not None:
+ # Using pandas DataFrame
+ for _, row in df.iterrows():
+ html.append('<tr>')
+ html.append(f'<td>{row["test_id"]}</td>')
+ html.append(f'<td>{row["date"]}</td>')
+ html.append(f'<td>{row["urls"]}</td>')
+ html.append(f'<td>{row["workers"]}</td>')
+ html.append(f'<td>{row["success_rate"]:.1f}%</td>')
+ html.append(f'<td>{row["time_seconds"]:.2f}</td>')
+ html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
+
+ # Memory growth cell
+ if pd.notna(row["memory_growth"]):
+ html.append(f'<td>{row["memory_growth"]:.1f}</td>')
+ else:
+ html.append('<td>N/A</td>')
+
+ html.append('</tr>')
+ else:
+ # Using list of dicts (when pandas is not available)
+ for row in rows:
+ html.append('<tr>')
+ html.append(f'<td>{row["test_id"]}</td>')
+ html.append(f'<td>{row["date"]}</td>')
+ html.append(f'<td>{row["urls"]}</td>')
+ html.append(f'<td>{row["workers"]}</td>')
+ html.append(f'<td>{row["success_rate"]:.1f}%</td>')
+ html.append(f'<td>{row["time_seconds"]:.2f}</td>')
+ html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
+
+ # Memory growth cell
+ if row["memory_growth"] is not None:
+ html.append(f'<td>{row["memory_growth"]:.1f}</td>')
+ else:
+ html.append('<td>N/A</td>')
+
+ html.append('</tr>')
+
+ html.append('</table>')
+
+ # Conclusion section
+ html.append('<div class="section">')
+ html.append('<h2>Conclusion</h2>')
+
+ if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
+ # Using pandas for statistics (when available)
+ # Calculate some overall statistics
+ avg_urls_per_sec = df['urls_per_second'].mean()
+ max_urls_per_sec = df['urls_per_second'].max()
+
+ # Determine if we have a trend
+ if len(df) > 1:
+ trend_data = df.sort_values('timestamp')
+ first_perf = trend_data.iloc[0]['urls_per_second']
+ last_perf = trend_data.iloc[-1]['urls_per_second']
+
+ perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0
+
+ if perf_change > 10:
+ trend_desc = "significantly improved"
+ trend_class = "status-good"
+ elif perf_change > 5:
+ trend_desc = "improved"
+ trend_class = "status-good"
+ elif perf_change < -10:
+ trend_desc = "significantly decreased"
+ trend_class = "status-bad"
+ elif perf_change < -5:
+ trend_desc = "decreased"
+ trend_class = "status-bad"
+ else:
+ trend_desc = "remained stable"
+ trend_class = ""
+
+ html.append(f'<p class="{trend_class}">Overall performance has {trend_desc} over the test period.</p>')
+
+ html.append(f'<p>Average throughput: {avg_urls_per_sec:.1f} URLs/second</p>')
+ html.append(f'<p>Maximum throughput: {max_urls_per_sec:.1f} URLs/second</p>')
+
+ # Memory leak assessment
+ if 'memory_growth' in df.columns and not df['memory_growth'].isna().all():
+ avg_growth = df['memory_growth'].mean()
+ max_growth = df['memory_growth'].max()
+
+ if avg_growth < 5:
+ leak_assessment = "No significant memory leaks detected"
+ leak_class = "status-good"
+ elif avg_growth < 10:
+ leak_assessment = "Minor memory growth observed"
+ leak_class = "status-warning"
+ else:
+ leak_assessment = "Potential memory leak detected"
+ leak_class = "status-bad"
+
+ html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: {avg_growth:.1f} MB per test.</p>')
+ else:
+ # Manual calculations without pandas
+ if rows:
+ # Calculate average and max throughput
+ total_urls_per_sec = sum(row['urls_per_second'] for row in rows)
+ avg_urls_per_sec = total_urls_per_sec / len(rows)
+ max_urls_per_sec = max(row['urls_per_second'] for row in rows)
+
+ html.append(f'<p>Average throughput: {avg_urls_per_sec:.1f} URLs/second</p>')
+ html.append(f'<p>Maximum throughput: {max_urls_per_sec:.1f} URLs/second</p>')
+
+ # Memory assessment (simplified without pandas)
+ growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None]
+ if growth_values:
+ avg_growth = sum(growth_values) / len(growth_values)
+
+ if avg_growth < 5:
+ leak_assessment = "No significant memory leaks detected"
+ leak_class = "status-good"
+ elif avg_growth < 10:
+ leak_assessment = "Minor memory growth observed"
+ leak_class = "status-warning"
+ else:
+ leak_assessment = "Potential memory leak detected"
+ leak_class = "status-bad"
+
+ html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: {avg_growth:.1f} MB per test.</p>')
+ else:
+ html.append('<p>No test data available for analysis.</p>')
+
+ html.append('</div>')
+
+ # Footer
+ html.append('<footer>')
+ html.append('<p>Generated by Crawl4AI Benchmark Reporter</p>')
+ html.append('</footer>')
+
+ html.append('</body>')
+ html.append('</html>')
+
+ # Write the HTML file
+ with open(output_file, 'w') as f:
+ f.write('\n'.join(html))
+
+ # Print a clickable link for terminals that support it (iTerm, VS Code, etc.)
+ file_url = f"file://{os.path.abspath(output_file)}"
+ console.print(f"[green]Comparison report saved to: {output_file}[/green]")
+ console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]")
+ return output_file
+
+ def run(self, limit=None, output_file=None):
+ """Generate a full benchmark report.
+
+ Args:
+ limit: Optional limit on number of most recent tests to include
+ output_file: Optional output file path
+
+ Returns:
+ Path to the generated report file
+ """
+ # Load test results
+ results = self.load_test_results(limit=limit)
+
+ if not results:
+ console.print("[yellow]No test results found. Run some tests first.[/yellow]")
+ return None
+
+ # Generate and display summary table
+ summary_table = self.generate_summary_table(results)
+ console.print(summary_table)
+
+ # Generate comparison report
+ title = f"Crawl4AI Benchmark Report ({len(results)} test runs)"
+ report_file = self.generate_comparison_report(results, title=title, output_file=output_file)
+
+ if report_file:
+ console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]")
+ return report_file
+ else:
+ console.print("[bold red]Failed to generate report[/bold red]")
+ return None
+
+
+def main():
+ """Main entry point for the benchmark reporter."""
+ parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests")
+
+ parser.add_argument("--reports-dir", type=str, default="reports",
+ help="Directory containing test result files")
+ parser.add_argument("--output-dir", type=str, default="benchmark_reports",
+ help="Directory to save generated reports")
+ parser.add_argument("--limit", type=int, default=None,
+ help="Limit to most recent N test results")
+ parser.add_argument("--output-file", type=str, default=None,
+ help="Custom output file path for the report")
+
+ args = parser.parse_args()
+
+ # Create the benchmark reporter
+ reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir)
+
+ # Generate the report
+ report_file = reporter.run(limit=args.limit, output_file=args.output_file)
+
+ if report_file:
+ print(f"Report generated at: {report_file}")
+ return 0
+ else:
+ print("Failed to generate report")
+ return 1
+
+
+if __name__ == "__main__":
+ import sys
+ sys.exit(main())
\ No newline at end of file
diff --git a/tests/memory/cap_test.py b/tests/memory/cap_test.py
new file mode 100644
index 00000000..56d7b261
--- /dev/null
+++ b/tests/memory/cap_test.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""
+Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
+"""
+
+import asyncio, httpx, json, uuid, argparse
+
+API = "http://localhost:8020/crawl"
+URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
+CONCURRENT_CALLS = 20 # way above your cap
+
+payload_template = {
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "verbose": False},
+ }
+}
+
+async def one_call(client):
+ payload = payload_template.copy()
+ payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
+ r = await client.post(API, json=payload)
+ r.raise_for_status()
+ return r.json()["server_peak_memory_mb"]
+
+async def main():
+ async with httpx.AsyncClient(timeout=60) as client:
+ tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
+ mem_usages = await asyncio.gather(*tasks)
+ print("Calls finished OK, server peaks reported:", mem_usages)
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/memory/requirements.txt b/tests/memory/requirements.txt
new file mode 100644
index 00000000..230e0e1f
--- /dev/null
+++ b/tests/memory/requirements.txt
@@ -0,0 +1,4 @@
+pandas>=1.5.0
+matplotlib>=3.5.0
+seaborn>=0.12.0
+rich>=12.0.0
\ No newline at end of file
diff --git a/tests/memory/run_benchmark.py b/tests/memory/run_benchmark.py
new file mode 100755
index 00000000..1e110ddf
--- /dev/null
+++ b/tests/memory/run_benchmark.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Run a complete Crawl4AI benchmark test using test_stress_sdk.py and generate a report.
+"""
+
+import sys
+import os
+import glob
+import argparse
+import subprocess
+import time
+from datetime import datetime
+
+from rich.console import Console
+from rich.text import Text
+
+console = Console()
+
+# Updated TEST_CONFIGS to use max_sessions
+TEST_CONFIGS = {
+ "quick": {"urls": 50, "max_sessions": 4, "chunk_size": 10, "description": "Quick test (50 URLs, 4 sessions)"},
+ "small": {"urls": 100, "max_sessions": 8, "chunk_size": 20, "description": "Small test (100 URLs, 8 sessions)"},
+ "medium": {"urls": 500, "max_sessions": 16, "chunk_size": 50, "description": "Medium test (500 URLs, 16 sessions)"},
+ "large": {"urls": 1000, "max_sessions": 32, "chunk_size": 100,"description": "Large test (1000 URLs, 32 sessions)"},
+ "extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200,"description": "Extreme test (2000 URLs, 64 sessions)"},
+}
+
+# Arguments to forward directly if present in custom_args
+FORWARD_ARGS = {
+ "urls": "--urls",
+ "max_sessions": "--max-sessions",
+ "chunk_size": "--chunk-size",
+ "port": "--port",
+ "monitor_mode": "--monitor-mode",
+}
+# Boolean flags to forward if True
+FORWARD_FLAGS = {
+ "stream": "--stream",
+ "use_rate_limiter": "--use-rate-limiter",
+ "keep_server_alive": "--keep-server-alive",
+ "use_existing_site": "--use-existing-site",
+ "skip_generation": "--skip-generation",
+ "keep_site": "--keep-site",
+ "clean_reports": "--clean-reports", # Note: clean behavior is handled here, but pass flag if needed
+ "clean_site": "--clean-site", # Note: clean behavior is handled here, but pass flag if needed
+}
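+# For example, args_to_use = {"urls": 500, "max_sessions": 16, "stream": True} is
+# translated by the loops below into:
+#   python test_stress_sdk.py --urls 500 --max-sessions 16 --stream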
+
+def run_benchmark(config_name, custom_args=None, compare=True, clean=False):
+ """Runs the stress test and optionally the report generator."""
+ if config_name not in TEST_CONFIGS and config_name != "custom":
+ console.print(f"[bold red]Unknown configuration: {config_name}[/bold red]")
+ return False
+
+ # Print header
+ title = "Crawl4AI SDK Benchmark Test"
+ if config_name != "custom":
+ title += f" - {TEST_CONFIGS[config_name]['description']}"
+ else:
+ # Safely get custom args for title
+ urls = custom_args.get('urls', '?') if custom_args else '?'
+ sessions = custom_args.get('max_sessions', '?') if custom_args else '?'
+ title += f" - Custom ({urls} URLs, {sessions} sessions)"
+
+ console.print(f"\n[bold blue]{title}[/bold blue]")
+ console.print("=" * (len(title) + 4)) # Adjust underline length
+
+ console.print("\n[bold white]Preparing test...[/bold white]")
+
+ # --- Command Construction ---
+ # Use the new script name
+ cmd = ["python", "test_stress_sdk.py"]
+
+ # Apply config or custom args
+ args_to_use = {}
+ if config_name != "custom":
+ args_to_use = TEST_CONFIGS[config_name].copy()
+ # If custom args are provided (e.g., boolean flags), overlay them
+ if custom_args:
+ args_to_use.update(custom_args)
+ elif custom_args: # Custom config
+ args_to_use = custom_args.copy()
+
+ # Add arguments with values
+ for key, arg_name in FORWARD_ARGS.items():
+ if key in args_to_use:
+ cmd.extend([arg_name, str(args_to_use[key])])
+
+ # Add boolean flags
+ for key, flag_name in FORWARD_FLAGS.items():
+ if args_to_use.get(key, False): # Check if key exists and is True
+            # Clean flags are applied by run_benchmark itself via its --clean option
+            # (handled below), so they are not forwarded; all other boolean flags are.
+ if key not in ["clean_reports", "clean_site"]:
+ cmd.append(flag_name)
+
+ # Handle the top-level --clean flag for run_benchmark
+ if clean:
+ # Pass clean flags to the stress test script as well, if needed
+ # This assumes test_stress_sdk.py also uses --clean-reports and --clean-site
+ cmd.append("--clean-reports")
+ cmd.append("--clean-site")
+ console.print("[yellow]Applying --clean: Cleaning reports and site before test.[/yellow]")
+ # Actual cleaning logic might reside here or be delegated entirely
+
+ console.print(f"\n[bold white]Running stress test:[/bold white] {' '.join(cmd)}")
+ start = time.time()
+
+ # Execute the stress test script
+ # Use Popen to stream output
+ try:
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8', errors='replace')
+ while True:
+ line = proc.stdout.readline()
+ if not line:
+ break
+ console.print(line.rstrip()) # Print line by line
+ proc.wait() # Wait for the process to complete
+ except FileNotFoundError:
+ console.print(f"[bold red]Error: Script 'test_stress_sdk.py' not found. Make sure it's in the correct directory.[/bold red]")
+ return False
+ except Exception as e:
+ console.print(f"[bold red]Error running stress test subprocess: {e}[/bold red]")
+ return False
+
+
+ if proc.returncode != 0:
+ console.print(f"[bold red]Stress test failed with exit code {proc.returncode}[/bold red]")
+ return False
+
+ duration = time.time() - start
+ console.print(f"[bold green]Stress test completed in {duration:.1f} seconds[/bold green]")
+
+ # --- Report Generation (Optional) ---
+ if compare:
+ # Assuming benchmark_report.py exists and works with the generated reports
+ report_script = "benchmark_report.py" # Keep configurable if needed
+ report_cmd = ["python", report_script]
+ console.print(f"\n[bold white]Generating benchmark report: {' '.join(report_cmd)}[/bold white]")
+
+ # Run the report command and capture output
+ try:
+ report_proc = subprocess.run(report_cmd, capture_output=True, text=True, check=False, encoding='utf-8', errors='replace') # Use check=False to handle potential errors
+
+ # Print the captured output from benchmark_report.py
+ if report_proc.stdout:
+ console.print("\n" + report_proc.stdout)
+ if report_proc.stderr:
+ console.print("[yellow]Report generator stderr:[/yellow]\n" + report_proc.stderr)
+
+ if report_proc.returncode != 0:
+ console.print(f"[bold yellow]Benchmark report generation script '{report_script}' failed with exit code {report_proc.returncode}[/bold yellow]")
+ # Don't return False here, test itself succeeded
+ else:
+ console.print(f"[bold green]Benchmark report script '{report_script}' completed.[/bold green]")
+
+ # Find and print clickable links to the reports
+ # Assuming reports are saved in 'benchmark_reports' by benchmark_report.py
+ report_dir = "benchmark_reports"
+ if os.path.isdir(report_dir):
+ report_files = glob.glob(os.path.join(report_dir, "comparison_report_*.html"))
+ if report_files:
+ try:
+ latest_report = max(report_files, key=os.path.getctime)
+ report_path = os.path.abspath(latest_report)
+ report_url = pathlib.Path(report_path).as_uri() # Better way to create file URI
+ console.print(f"[bold cyan]Click to open report: [link={report_url}]{report_url}[/link][/bold cyan]")
+ except Exception as e:
+ console.print(f"[yellow]Could not determine latest report: {e}[/yellow]")
+
+ chart_files = glob.glob(os.path.join(report_dir, "memory_chart_*.png"))
+ if chart_files:
+ try:
+ latest_chart = max(chart_files, key=os.path.getctime)
+ chart_path = os.path.abspath(latest_chart)
+ chart_url = pathlib.Path(chart_path).as_uri()
+ console.print(f"[cyan]Memory chart: [link={chart_url}]{chart_url}[/link][/cyan]")
+ except Exception as e:
+ console.print(f"[yellow]Could not determine latest chart: {e}[/yellow]")
+ else:
+ console.print(f"[yellow]Benchmark report directory '{report_dir}' not found. Cannot link reports.[/yellow]")
+
+ except FileNotFoundError:
+ console.print(f"[bold red]Error: Report script '{report_script}' not found.[/bold red]")
+ except Exception as e:
+ console.print(f"[bold red]Error running report generation subprocess: {e}[/bold red]")
+
+
+ # Prompt to exit
+ console.print("\n[bold green]Benchmark run finished. Press Enter to exit.[/bold green]")
+ try:
+ input() # Wait for user input
+ except EOFError:
+ pass # Handle case where input is piped or unavailable
+
+ return True
+
+def main():
+ parser = argparse.ArgumentParser(description="Run a Crawl4AI SDK benchmark test and generate a report")
+
+ # --- Arguments ---
+ parser.add_argument("config", choices=list(TEST_CONFIGS) + ["custom"],
+ help="Test configuration: quick, small, medium, large, extreme, or custom")
+
+ # Arguments for 'custom' config or to override presets
+ parser.add_argument("--urls", type=int, help="Number of URLs")
+ parser.add_argument("--max-sessions", type=int, help="Max concurrent sessions (replaces --workers)")
+ parser.add_argument("--chunk-size", type=int, help="URLs per batch (for non-stream logging)")
+ parser.add_argument("--port", type=int, help="HTTP server port")
+ parser.add_argument("--monitor-mode", type=str, choices=["DETAILED", "AGGREGATED"], help="Monitor display mode")
+
+ # Boolean flags / options
+ parser.add_argument("--stream", action="store_true", help="Enable streaming results (disables batch logging)")
+ parser.add_argument("--use-rate-limiter", action="store_true", help="Enable basic rate limiter")
+ parser.add_argument("--no-report", action="store_true", help="Skip generating comparison report")
+ parser.add_argument("--clean", action="store_true", help="Clean up reports and site before running")
+ parser.add_argument("--keep-server-alive", action="store_true", help="Keep HTTP server running after test")
+ parser.add_argument("--use-existing-site", action="store_true", help="Use existing site on specified port")
+ parser.add_argument("--skip-generation", action="store_true", help="Use existing site files without regenerating")
+ parser.add_argument("--keep-site", action="store_true", help="Keep generated site files after test")
+ # Removed url_level_logging as it's implicitly handled by stream/batch mode now
+
+ args = parser.parse_args()
+
+ custom_args = {}
+
+ # Populate custom_args from explicit command-line args
+ if args.urls is not None: custom_args["urls"] = args.urls
+ if args.max_sessions is not None: custom_args["max_sessions"] = args.max_sessions
+ if args.chunk_size is not None: custom_args["chunk_size"] = args.chunk_size
+ if args.port is not None: custom_args["port"] = args.port
+ if args.monitor_mode is not None: custom_args["monitor_mode"] = args.monitor_mode
+ if args.stream: custom_args["stream"] = True
+ if args.use_rate_limiter: custom_args["use_rate_limiter"] = True
+ if args.keep_server_alive: custom_args["keep_server_alive"] = True
+ if args.use_existing_site: custom_args["use_existing_site"] = True
+ if args.skip_generation: custom_args["skip_generation"] = True
+ if args.keep_site: custom_args["keep_site"] = True
+ # Clean flags are handled by the 'clean' argument passed to run_benchmark
+
+ # Validate custom config requirements
+ if args.config == "custom":
+ required_custom = ["urls", "max_sessions", "chunk_size"]
+ missing = [f"--{arg}" for arg in required_custom if arg not in custom_args]
+ if missing:
+ console.print(f"[bold red]Error: 'custom' config requires: {', '.join(missing)}[/bold red]")
+ return 1
+
+ success = run_benchmark(
+ config_name=args.config,
+ custom_args=custom_args, # Pass all collected custom args
+ compare=not args.no_report,
+ clean=args.clean
+ )
+ return 0 if success else 1
+
+if __name__ == "__main__":
+ sys.exit(main())
\ No newline at end of file
diff --git a/tests/memory/test_crawler_monitor.py b/tests/memory/test_crawler_monitor.py
new file mode 100644
index 00000000..89cc08b8
--- /dev/null
+++ b/tests/memory/test_crawler_monitor.py
@@ -0,0 +1,168 @@
+"""
+Test script for the CrawlerMonitor component.
+This script simulates a crawler with multiple tasks to demonstrate the real-time monitoring capabilities.
+"""
+
+import time
+import uuid
+import random
+import threading
+import sys
+import os
+
+# Add the parent directory to the path to import crawl4ai
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+
+from crawl4ai.components.crawler_monitor import CrawlerMonitor
+from crawl4ai.models import CrawlStatus
+
+def simulate_crawler_task(monitor, task_id, url, simulate_failure=False):
+ """Simulate a crawler task with different states."""
+ # Task starts in the QUEUED state
+ wait_time = random.uniform(0.5, 3.0)
+ time.sleep(wait_time)
+
+ # Update to IN_PROGRESS state
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.IN_PROGRESS,
+ start_time=time.time(),
+ wait_time=wait_time
+ )
+
+ # Simulate task running
+ process_time = random.uniform(1.0, 5.0)
+ for i in range(int(process_time * 2)):
+ # Simulate memory usage changes
+ memory_usage = random.uniform(5.0, 25.0)
+ monitor.update_task(
+ task_id=task_id,
+ memory_usage=memory_usage,
+ peak_memory=max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
+ )
+ time.sleep(0.5)
+
+ # Update to COMPLETED or FAILED state
+ if simulate_failure and random.random() < 0.8: # 80% chance of failure if simulate_failure is True
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.FAILED,
+ end_time=time.time(),
+ error_message="Simulated failure: Connection timeout",
+ memory_usage=0.0
+ )
+ else:
+ monitor.update_task(
+ task_id=task_id,
+ status=CrawlStatus.COMPLETED,
+ end_time=time.time(),
+ memory_usage=0.0
+ )
+
+def update_queue_stats(monitor, num_queued_tasks):
+ """Update queue statistics periodically."""
+ while monitor.is_running:
+ queued_tasks = [
+ task for task_id, task in monitor.get_all_task_stats().items()
+ if task["status"] == CrawlStatus.QUEUED.name
+ ]
+
+ total_queued = len(queued_tasks)
+
+ if total_queued > 0:
+ current_time = time.time()
+ wait_times = [
+ current_time - task.get("enqueue_time", current_time)
+ for task in queued_tasks
+ ]
+ highest_wait_time = max(wait_times) if wait_times else 0.0
+ avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
+ else:
+ highest_wait_time = 0.0
+ avg_wait_time = 0.0
+
+ monitor.update_queue_statistics(
+ total_queued=total_queued,
+ highest_wait_time=highest_wait_time,
+ avg_wait_time=avg_wait_time
+ )
+
+ # Simulate memory pressure based on number of active tasks
+ active_tasks = len([
+ task for task_id, task in monitor.get_all_task_stats().items()
+ if task["status"] == CrawlStatus.IN_PROGRESS.name
+ ])
+
+ if active_tasks > 8:
+ monitor.update_memory_status("CRITICAL")
+ elif active_tasks > 4:
+ monitor.update_memory_status("PRESSURE")
+ else:
+ monitor.update_memory_status("NORMAL")
+
+ time.sleep(1.0)
+
+def test_crawler_monitor():
+ """Test the CrawlerMonitor with simulated crawler tasks."""
+ # Total number of URLs to crawl
+ total_urls = 50
+
+ # Initialize the monitor
+ monitor = CrawlerMonitor(urls_total=total_urls, refresh_rate=0.5)
+
+ # Start the monitor
+ monitor.start()
+
+ # Start thread to update queue statistics
+ queue_stats_thread = threading.Thread(target=update_queue_stats, args=(monitor, total_urls))
+ queue_stats_thread.daemon = True
+ queue_stats_thread.start()
+
+ try:
+ # Create task threads
+ threads = []
+ for i in range(total_urls):
+ task_id = str(uuid.uuid4())
+ url = f"https://example.com/page{i}"
+
+ # Add task to monitor
+ monitor.add_task(task_id, url)
+
+ # Determine if this task should simulate failure
+ simulate_failure = (i % 10 == 0) # Every 10th task
+
+ # Create and start thread for this task
+ thread = threading.Thread(
+ target=simulate_crawler_task,
+ args=(monitor, task_id, url, simulate_failure)
+ )
+ thread.daemon = True
+ threads.append(thread)
+
+ # Start threads with delay to simulate tasks being added over time
+ batch_size = 5
+ for i in range(0, len(threads), batch_size):
+ batch = threads[i:i+batch_size]
+ for thread in batch:
+ thread.start()
+ time.sleep(0.5) # Small delay between starting threads
+
+ # Wait a bit before starting the next batch
+ time.sleep(2.0)
+
+ # Wait for all threads to complete
+ for thread in threads:
+ thread.join()
+
+ # Keep monitor running a bit longer to see the final state
+ time.sleep(5.0)
+
+ except KeyboardInterrupt:
+ print("\nTest interrupted by user")
+ finally:
+ # Stop the monitor
+ monitor.stop()
+ print("\nCrawler monitor test completed")
+
+if __name__ == "__main__":
+ test_crawler_monitor()
\ No newline at end of file
diff --git a/tests/memory/test_dispatcher_stress.py b/tests/memory/test_dispatcher_stress.py
new file mode 100644
index 00000000..f81f78f6
--- /dev/null
+++ b/tests/memory/test_dispatcher_stress.py
@@ -0,0 +1,410 @@
+import asyncio
+import time
+import psutil
+import logging
+import random
+from typing import List, Dict
+import uuid
+import sys
+import os
+
+# Import your crawler components
+from crawl4ai.models import DisplayMode, CrawlStatus, CrawlResult
+from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig, CacheMode
+from crawl4ai import AsyncWebCrawler
+from crawl4ai import MemoryAdaptiveDispatcher, CrawlerMonitor
+
+# Global configuration
+STREAM = False # Toggle between streaming and non-streaming modes
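+# Illustrative positional CLI, parsed in the __main__ block at the bottom of this file:
+#   python test_dispatcher_stress.py 100 85.0 stream aggressive
+#   (URL count, target memory %, stream mode on, aggressive allocation on)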
+
+# Configure logging to file only (to avoid breaking the rich display)
+os.makedirs("logs", exist_ok=True)
+file_handler = logging.FileHandler("logs/memory_stress_test.log")
+file_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
+
+# Root logger - only to file, not console
+root_logger = logging.getLogger()
+root_logger.setLevel(logging.INFO)
+root_logger.addHandler(file_handler)
+
+# Our test logger also writes to file only
+logger = logging.getLogger("memory_stress_test")
+logger.setLevel(logging.INFO)
+logger.addHandler(file_handler)
+logger.propagate = False # Don't propagate to root logger
+
+# Create a memory restrictor to simulate limited memory environment
+class MemorySimulator:
+ def __init__(self, target_percent: float = 85.0, aggressive: bool = False):
+ """Simulates memory pressure by allocating memory"""
+ self.target_percent = target_percent
+ self.memory_blocks: List[bytearray] = []
+ self.aggressive = aggressive
+
+ def apply_pressure(self, additional_percent: float = 0.0):
+ """Fill memory until we reach target percentage"""
+ current_percent = psutil.virtual_memory().percent
+ target = self.target_percent + additional_percent
+
+ if current_percent >= target:
+ return # Already at target
+
+ logger.info(f"Current memory: {current_percent}%, target: {target}%")
+
+ # Calculate how much memory we need to allocate
+ total_memory = psutil.virtual_memory().total
+ target_usage = (target / 100.0) * total_memory
+ current_usage = (current_percent / 100.0) * total_memory
+ bytes_to_allocate = int(target_usage - current_usage)
+
+ if bytes_to_allocate <= 0:
+ return
+
+ # Allocate in smaller chunks to avoid overallocation
+ if self.aggressive:
+ # Use larger chunks for faster allocation in aggressive mode
+ chunk_size = min(bytes_to_allocate, 200 * 1024 * 1024) # 200MB chunks
+ else:
+ chunk_size = min(bytes_to_allocate, 50 * 1024 * 1024) # 50MB chunks
+
+ try:
+ logger.info(f"Allocating {chunk_size / (1024 * 1024):.1f}MB to reach target memory usage")
+ self.memory_blocks.append(bytearray(chunk_size))
+ time.sleep(0.5) # Give system time to register the allocation
+ except MemoryError:
+ logger.warning("Unable to allocate more memory")
+
+ def release_pressure(self, percent: float = None):
+ """
+ Release allocated memory
+ If percent is specified, release that percentage of blocks
+ """
+ if not self.memory_blocks:
+ return
+
+ if percent is None:
+ # Release all
+ logger.info(f"Releasing all {len(self.memory_blocks)} memory blocks")
+ self.memory_blocks.clear()
+ else:
+ # Release specified percentage
+ blocks_to_release = int(len(self.memory_blocks) * (percent / 100.0))
+ if blocks_to_release > 0:
+ logger.info(f"Releasing {blocks_to_release} of {len(self.memory_blocks)} memory blocks ({percent}%)")
+ self.memory_blocks = self.memory_blocks[blocks_to_release:]
+
+ def spike_pressure(self, duration: float = 5.0):
+ """
+ Create a temporary spike in memory pressure then release
+ Useful for forcing requeues
+ """
+ logger.info(f"Creating memory pressure spike for {duration} seconds")
+ # Save current blocks count
+ initial_blocks = len(self.memory_blocks)
+
+ # Create spike with extra 5%
+ self.apply_pressure(additional_percent=5.0)
+
+ # Schedule release after duration
+ asyncio.create_task(self._delayed_release(duration, initial_blocks))
+
+ async def _delayed_release(self, delay: float, target_blocks: int):
+ """Helper for spike_pressure - releases extra blocks after delay"""
+ await asyncio.sleep(delay)
+
+ # Remove blocks added since spike started
+ if len(self.memory_blocks) > target_blocks:
+ logger.info(f"Releasing memory spike ({len(self.memory_blocks) - target_blocks} blocks)")
+ self.memory_blocks = self.memory_blocks[:target_blocks]
+
+# Test statistics collector
+class TestResults:
+ def __init__(self):
+ self.start_time = time.time()
+ self.completed_urls: List[str] = []
+ self.failed_urls: List[str] = []
+ self.requeued_count = 0
+ self.memory_warnings = 0
+ self.max_memory_usage = 0.0
+ self.max_queue_size = 0
+ self.max_wait_time = 0.0
+ self.url_to_attempt: Dict[str, int] = {} # Track retries per URL
+
+ def log_summary(self):
+ duration = time.time() - self.start_time
+ logger.info("===== TEST SUMMARY =====")
+ logger.info(f"Stream mode: {'ON' if STREAM else 'OFF'}")
+ logger.info(f"Total duration: {duration:.1f} seconds")
+ logger.info(f"Completed URLs: {len(self.completed_urls)}")
+ logger.info(f"Failed URLs: {len(self.failed_urls)}")
+ logger.info(f"Requeue events: {self.requeued_count}")
+ logger.info(f"Memory warnings: {self.memory_warnings}")
+ logger.info(f"Max memory usage: {self.max_memory_usage:.1f}%")
+ logger.info(f"Max queue size: {self.max_queue_size}")
+ logger.info(f"Max wait time: {self.max_wait_time:.1f} seconds")
+
+ # Log URLs with multiple attempts
+ retried_urls = {url: count for url, count in self.url_to_attempt.items() if count > 1}
+ if retried_urls:
+ logger.info(f"URLs with retries: {len(retried_urls)}")
+ # Log the top 5 most retried
+ top_retries = sorted(retried_urls.items(), key=lambda x: x[1], reverse=True)[:5]
+ for url, count in top_retries:
+ logger.info(f" URL {url[-30:]} had {count} attempts")
+
+ # Write summary to a separate human-readable file
+ with open("logs/test_summary.txt", "w") as f:
+ f.write(f"Stream mode: {'ON' if STREAM else 'OFF'}\n")
+ f.write(f"Total duration: {duration:.1f} seconds\n")
+ f.write(f"Completed URLs: {len(self.completed_urls)}\n")
+ f.write(f"Failed URLs: {len(self.failed_urls)}\n")
+ f.write(f"Requeue events: {self.requeued_count}\n")
+ f.write(f"Memory warnings: {self.memory_warnings}\n")
+ f.write(f"Max memory usage: {self.max_memory_usage:.1f}%\n")
+ f.write(f"Max queue size: {self.max_queue_size}\n")
+ f.write(f"Max wait time: {self.max_wait_time:.1f} seconds\n")
+
+# Custom monitor with stats tracking
+# Custom monitor that extends CrawlerMonitor with test-specific tracking
+class StressTestMonitor(CrawlerMonitor):
+ def __init__(self, test_results: TestResults, **kwargs):
+ # Initialize the parent CrawlerMonitor
+ super().__init__(**kwargs)
+ self.test_results = test_results
+
+ def update_memory_status(self, status: str):
+ if status != self.memory_status:
+ logger.info(f"Memory status changed: {self.memory_status} -> {status}")
+ if "CRITICAL" in status or "PRESSURE" in status:
+ self.test_results.memory_warnings += 1
+
+ # Track peak memory usage in test results
+ current_memory = psutil.virtual_memory().percent
+ self.test_results.max_memory_usage = max(self.test_results.max_memory_usage, current_memory)
+
+ # Call parent method to update the dashboard
+ super().update_memory_status(status)
+
+ def update_queue_statistics(self, total_queued: int, highest_wait_time: float, avg_wait_time: float):
+ # Track queue metrics in test results
+ self.test_results.max_queue_size = max(self.test_results.max_queue_size, total_queued)
+ self.test_results.max_wait_time = max(self.test_results.max_wait_time, highest_wait_time)
+
+ # Call parent method to update the dashboard
+ super().update_queue_statistics(total_queued, highest_wait_time, avg_wait_time)
+
+ def update_task(self, task_id: str, **kwargs):
+ # Track URL status changes for test results
+ if task_id in self.stats:
+ old_status = self.stats[task_id].status
+
+ # If this is a requeue event (requeued due to memory pressure)
+ if 'error_message' in kwargs and 'requeued' in kwargs['error_message']:
+ if not hasattr(self.stats[task_id], 'counted_requeue') or not self.stats[task_id].counted_requeue:
+ self.test_results.requeued_count += 1
+ self.stats[task_id].counted_requeue = True
+
+ # Track completion status for test results
+ if 'status' in kwargs:
+ new_status = kwargs['status']
+ if old_status != new_status:
+ if new_status == CrawlStatus.COMPLETED:
+ if task_id not in self.test_results.completed_urls:
+ self.test_results.completed_urls.append(task_id)
+ elif new_status == CrawlStatus.FAILED:
+ if task_id not in self.test_results.failed_urls:
+ self.test_results.failed_urls.append(task_id)
+
+ # Call parent method to update the dashboard
+ super().update_task(task_id, **kwargs)
+ self.live.update(self._create_table())
+
+# Generate test URLs - use example.com with unique paths to avoid browser caching
+def generate_test_urls(count: int) -> List[str]:
+ urls = []
+ for i in range(count):
+ # Add random path and query parameters to create unique URLs
+ path = f"/path/{uuid.uuid4()}"
+ query = f"?test={i}&random={random.randint(1, 100000)}"
+ urls.append(f"https://example.com{path}{query}")
+ return urls
+
+# Process result callback
+async def process_result(result, test_results: TestResults):
+ # Track attempt counts
+ if result.url not in test_results.url_to_attempt:
+ test_results.url_to_attempt[result.url] = 1
+ else:
+ test_results.url_to_attempt[result.url] += 1
+
+ if "requeued" in result.error_message:
+ test_results.requeued_count += 1
+ logger.debug(f"Requeued due to memory pressure: {result.url}")
+ elif result.success:
+ test_results.completed_urls.append(result.url)
+ logger.debug(f"Successfully processed: {result.url}")
+ else:
+ test_results.failed_urls.append(result.url)
+ logger.warning(f"Failed to process: {result.url} - {result.error_message}")
+
+# Process multiple results (used in non-streaming mode)
+async def process_results(results, test_results: TestResults):
+ for result in results:
+ await process_result(result, test_results)
+
+# Main test function for extreme memory pressure simulation
+async def run_memory_stress_test(
+ url_count: int = 100,
+ target_memory_percent: float = 92.0, # Push to dangerous levels
+ chunk_size: int = 20, # Larger chunks for more chaos
+ aggressive: bool = False,
+ spikes: bool = True
+):
+ test_results = TestResults()
+ memory_simulator = MemorySimulator(target_percent=target_memory_percent, aggressive=aggressive)
+
+ logger.info(f"Starting stress test with {url_count} URLs in {'STREAM' if STREAM else 'NON-STREAM'} mode")
+ logger.info(f"Target memory usage: {target_memory_percent}%")
+
+ # First, elevate memory usage to create pressure
+ logger.info("Creating initial memory pressure...")
+ memory_simulator.apply_pressure()
+
+ # Create test URLs in chunks to simulate real-world crawling where URLs are discovered
+ all_urls = generate_test_urls(url_count)
+ url_chunks = [all_urls[i:i+chunk_size] for i in range(0, len(all_urls), chunk_size)]
+
+ # Set up the crawler components - low memory thresholds to create more requeues
+ browser_config = BrowserConfig(headless=True, verbose=False)
+ run_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ verbose=False,
+ stream=STREAM # Use the global STREAM variable to set mode
+ )
+
+ # Create monitor with reference to test results
+ monitor = StressTestMonitor(
+ test_results=test_results,
+ display_mode=DisplayMode.DETAILED,
+ max_visible_rows=20,
+ total_urls=url_count # Pass total URLs count
+ )
+
+ # Create dispatcher with EXTREME settings - pure survival mode
+ # These settings are designed to create a memory battleground
+ dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=63.0,      # Start throttling at just 63% memory
+ critical_threshold_percent=70.0, # Start requeuing at 70% - incredibly aggressive
+ recovery_threshold_percent=55.0, # Only resume normal ops when plenty of memory available
+ check_interval=0.1, # Check extremely frequently (100ms)
+ max_session_permit=20 if aggressive else 10, # Double the concurrent sessions - pure chaos
+ fairness_timeout=10.0, # Extremely low timeout - rapid priority changes
+ monitor=monitor
+ )
+
+ # Set up spike schedule if enabled
+ if spikes:
+ spike_intervals = []
+ # Create 3-5 random spike times
+ num_spikes = random.randint(3, 5)
+ for _ in range(num_spikes):
+ # Schedule spikes at random chunks
+ chunk_index = random.randint(1, len(url_chunks) - 1)
+ spike_intervals.append(chunk_index)
+ logger.info(f"Scheduled memory spikes at chunks: {spike_intervals}")
+
+ try:
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Process URLs in chunks to simulate discovering URLs over time
+ for chunk_index, url_chunk in enumerate(url_chunks):
+ logger.info(f"Processing chunk {chunk_index+1}/{len(url_chunks)} ({len(url_chunk)} URLs)")
+
+ # Regular pressure increases
+ if chunk_index % 2 == 0:
+ logger.info("Increasing memory pressure...")
+ memory_simulator.apply_pressure()
+
+ # Memory spike if scheduled for this chunk
+ if spikes and chunk_index in spike_intervals:
+ logger.info(f"⚠️ CREATING MASSIVE MEMORY SPIKE at chunk {chunk_index+1} ⚠️")
+ # Create a nightmare scenario - multiple overlapping spikes
+ memory_simulator.spike_pressure(duration=10.0) # 10-second spike
+
+ # 50% chance of double-spike (pure evil)
+ if random.random() < 0.5:
+ await asyncio.sleep(2.0) # Wait 2 seconds
+ logger.info("💀 DOUBLE SPIKE - EXTREME MEMORY PRESSURE 💀")
+ memory_simulator.spike_pressure(duration=8.0) # 8-second overlapping spike
+
+ if STREAM:
+ # Stream mode - process results as they come in
+ async for result in dispatcher.run_urls_stream(
+ urls=url_chunk,
+ crawler=crawler,
+ config=run_config
+ ):
+ await process_result(result, test_results)
+ else:
+ # Non-stream mode - get all results at once
+ results = await dispatcher.run_urls(
+ urls=url_chunk,
+ crawler=crawler,
+ config=run_config
+ )
+ await process_results(results, test_results)
+
+ # Simulate discovering more URLs while others are still processing
+ await asyncio.sleep(1)
+
+ # RARELY release pressure - make the system fight for resources
+ if chunk_index % 5 == 4: # Less frequent releases
+ release_percent = random.choice([10, 15, 20]) # Smaller, inconsistent releases
+ logger.info(f"Releasing {release_percent}% of memory blocks - brief respite")
+ memory_simulator.release_pressure(percent=release_percent)
+
+ except Exception as e:
+ logger.error(f"Test error: {str(e)}")
+ raise
+ finally:
+ # Release memory pressure
+ memory_simulator.release_pressure()
+ # Log final results
+ test_results.log_summary()
+
+ # Check for success criteria
+ if len(test_results.completed_urls) + len(test_results.failed_urls) < url_count:
+ logger.error(f"TEST FAILED: Not all URLs were processed. {url_count - len(test_results.completed_urls) - len(test_results.failed_urls)} URLs missing.")
+ return False
+
+ logger.info("TEST PASSED: All URLs were processed without crashing.")
+ return True
+
+# Command-line entry point
+if __name__ == "__main__":
+ # Parse command line arguments
+ url_count = int(sys.argv[1]) if len(sys.argv) > 1 else 100
+ target_memory = float(sys.argv[2]) if len(sys.argv) > 2 else 85.0
+
+ # Check if stream mode is specified
+ if len(sys.argv) > 3:
+ STREAM = sys.argv[3].lower() in ('true', 'yes', '1', 'stream')
+
+ # Check if aggressive mode is specified
+ aggressive = False
+ if len(sys.argv) > 4:
+ aggressive = sys.argv[4].lower() in ('true', 'yes', '1', 'aggressive')
+
+ print(f"Starting test with {url_count} URLs, {target_memory}% memory target")
+ print(f"Stream mode: {STREAM}, Aggressive: {aggressive}")
+ print("Logs will be written to the logs directory")
+ print("Live display starting now...")
+
+ # Run the test
+ result = asyncio.run(run_memory_stress_test(
+ url_count=url_count,
+ target_memory_percent=target_memory,
+ aggressive=aggressive
+ ))
+
+ # Exit with status code
+ sys.exit(0 if result else 1)
\ No newline at end of file
diff --git a/tests/memory/test_docker_config_gen.py b/tests/memory/test_docker_config_gen.py
new file mode 100644
index 00000000..ae6e533c
--- /dev/null
+++ b/tests/memory/test_docker_config_gen.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""
+Quick sanity‑check for /config/dump endpoint.
+
+Usage:
+    python test_docker_config_gen.py [http://localhost:8020]
+
+If the server isn’t running, start it first:
+ uvicorn deploy.docker.server:app --port 8020
+"""
+
+import sys, json, textwrap, requests
+
+BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
+URL = f"{BASE.rstrip('/')}/config/dump"
+
+CASES = [
+ # --- CrawlRunConfig variants ---
+ "CrawlerRunConfig()",
+ "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
+ "CrawlerRunConfig(js_only=True, wait_until='networkidle')",
+
+ # --- BrowserConfig variants ---
+ "BrowserConfig()",
+ "BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
+ "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
+]
+
+for code in CASES:
+ print("\n=== POST:", code)
+ resp = requests.post(URL, json={"code": code}, timeout=15)
+ if resp.ok:
+ print(json.dumps(resp.json(), indent=2)[:400] + "...")
+ else:
+ print("ERROR", resp.status_code, resp.text[:200])
diff --git a/tests/memory/test_stress_api.py b/tests/memory/test_stress_api.py
new file mode 100644
index 00000000..1b4f1a9c
--- /dev/null
+++ b/tests/memory/test_stress_api.py
@@ -0,0 +1,520 @@
+#!/usr/bin/env python3
+"""
+Stress test for Crawl4AI's Docker API server (/crawl and /crawl/stream endpoints).
+
+This version targets a running Crawl4AI API server, sending concurrent requests
+to test its ability to handle multiple crawl jobs simultaneously.
+It uses httpx for async HTTP requests and logs results per batch of requests,
+including server-side memory usage reported by the API.
+"""
+
+import asyncio
+import time
+import uuid
+import argparse
+import json
+import sys
+import os
+import shutil
+from typing import List, Dict, Optional, Union, AsyncGenerator, Tuple
+import httpx
+import pathlib # Import pathlib explicitly
+from rich.console import Console
+from rich.panel import Panel
+from rich.syntax import Syntax
+
+# --- Constants ---
+DEFAULT_API_URL = "http://localhost:11235" # Default port
+DEFAULT_API_URL = "http://localhost:8020" # Default port
+DEFAULT_URL_COUNT = 100
+DEFAULT_MAX_CONCURRENT_REQUESTS = 1
+DEFAULT_CHUNK_SIZE = 10
+DEFAULT_REPORT_PATH = "reports_api"
+DEFAULT_STREAM_MODE = True
+REQUEST_TIMEOUT = 180.0
+
+# Initialize Rich console
+console = Console()
+
+# --- API Health Check (Unchanged) ---
+async def check_server_health(client: httpx.AsyncClient, health_endpoint: str = "/health"):
+ """Check if the API server is healthy."""
+ console.print(f"[bold cyan]Checking API server health at {client.base_url}{health_endpoint}...[/]", end="")
+ try:
+ response = await client.get(health_endpoint, timeout=10.0)
+ response.raise_for_status()
+ health_data = response.json()
+ version = health_data.get('version', 'N/A')
+ console.print(f"[bold green] Server OK! Version: {version}[/]")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ console.print(f"\n[bold red]Server health check FAILED:[/]")
+ console.print(f"Error: {e}")
+ console.print(f"Is the server running and accessible at {client.base_url}?")
+ return False
+ except Exception as e:
+ console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+ console.print(e)
+ return False
+
+# --- API Stress Test Class ---
+class ApiStressTest:
+ """Orchestrates the stress test by sending concurrent requests to the API."""
+
+ def __init__(
+ self,
+ api_url: str,
+ url_count: int,
+ max_concurrent_requests: int,
+ chunk_size: int,
+ report_path: str,
+ stream_mode: bool,
+ ):
+ self.api_base_url = api_url.rstrip('/')
+ self.url_count = url_count
+ self.max_concurrent_requests = max_concurrent_requests
+ self.chunk_size = chunk_size
+ self.report_path = pathlib.Path(report_path)
+ self.report_path.mkdir(parents=True, exist_ok=True)
+ self.stream_mode = stream_mode
+
+ # Ignore repo path and set it to current file path
+ self.repo_path = pathlib.Path(__file__).parent.resolve()
+
+
+ self.test_id = time.strftime("%Y%m%d_%H%M%S")
+ self.results_summary = {
+ "test_id": self.test_id, "api_url": api_url, "url_count": url_count,
+ "max_concurrent_requests": max_concurrent_requests, "chunk_size": chunk_size,
+ "stream_mode": stream_mode, "start_time": "", "end_time": "",
+ "total_time_seconds": 0, "successful_requests": 0, "failed_requests": 0,
+ "successful_urls": 0, "failed_urls": 0, "total_urls_processed": 0,
+ "total_api_calls": 0,
+ "server_memory_metrics": { # To store aggregated server memory info
+ "batch_mode_avg_delta_mb": None,
+ "batch_mode_max_delta_mb": None,
+ "stream_mode_avg_max_snapshot_mb": None,
+ "stream_mode_max_max_snapshot_mb": None,
+ "samples": [] # Store individual request memory results
+ }
+ }
+ self.http_client = httpx.AsyncClient(base_url=self.api_base_url, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=max_concurrent_requests + 5, max_keepalive_connections=max_concurrent_requests))
+
+ async def close_client(self):
+ """Close the httpx client."""
+ await self.http_client.aclose()
+
+ async def run(self) -> Dict:
+ """Run the API stress test."""
+ # No client memory tracker needed
+ urls_to_process = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(self.url_count)]
+ url_chunks = [urls_to_process[i:i+self.chunk_size] for i in range(0, len(urls_to_process), self.chunk_size)]
+
+ self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
+ start_time = time.time()
+
+ console.print(f"\n[bold cyan]Crawl4AI API Stress Test - {self.url_count} URLs, {self.max_concurrent_requests} concurrent requests[/bold cyan]")
+ console.print(f"[bold cyan]Target API:[/bold cyan] {self.api_base_url}, [bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]URLs per Request:[/bold cyan] {self.chunk_size}")
+ # Removed client memory log
+
+ semaphore = asyncio.Semaphore(self.max_concurrent_requests)
+
+ # Updated Batch logging header
+ console.print("\n[bold]API Request Batch Progress:[/bold]")
+ # Adjusted spacing and added Peak
+ console.print("[bold] Batch | Progress | SrvMem Peak / Δ|Max (MB) | Reqs/sec | S/F URLs | Time (s) | Status [/bold]")
+ # Adjust separator length if needed, looks okay for now
+ console.print("─" * 95)
+
+ # No client memory monitor task needed
+
+ tasks = []
+ total_api_calls = len(url_chunks)
+ self.results_summary["total_api_calls"] = total_api_calls
+
+ try:
+ for i, chunk in enumerate(url_chunks):
+ task = asyncio.create_task(self._make_api_request(
+ chunk=chunk,
+ batch_idx=i + 1,
+ total_batches=total_api_calls,
+ semaphore=semaphore
+ # No memory tracker passed
+ ))
+ tasks.append(task)
+
+ api_results = await asyncio.gather(*tasks)
+
+ # Process aggregated results including server memory
+ total_successful_requests = sum(1 for r in api_results if r['request_success'])
+ total_failed_requests = total_api_calls - total_successful_requests
+ total_successful_urls = sum(r['success_urls'] for r in api_results)
+ total_failed_urls = sum(r['failed_urls'] for r in api_results)
+ total_urls_processed = total_successful_urls + total_failed_urls
+
+ # Aggregate server memory metrics
+ valid_samples = [r for r in api_results if r.get('server_delta_or_max_mb') is not None] # Filter results with valid mem data
+ self.results_summary["server_memory_metrics"]["samples"] = valid_samples # Store raw samples with both peak and delta/max
+
+ if valid_samples:
+ delta_or_max_values = [r['server_delta_or_max_mb'] for r in valid_samples]
+ if self.stream_mode:
+ # Stream mode: delta_or_max holds max snapshot
+ self.results_summary["server_memory_metrics"]["stream_mode_avg_max_snapshot_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
+ self.results_summary["server_memory_metrics"]["stream_mode_max_max_snapshot_mb"] = max(delta_or_max_values)
+ else: # Batch mode
+ # delta_or_max holds delta
+ self.results_summary["server_memory_metrics"]["batch_mode_avg_delta_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
+ self.results_summary["server_memory_metrics"]["batch_mode_max_delta_mb"] = max(delta_or_max_values)
+
+ # Aggregate peak values for batch mode
+ peak_values = [r['server_peak_memory_mb'] for r in valid_samples if r.get('server_peak_memory_mb') is not None]
+ if peak_values:
+ self.results_summary["server_memory_metrics"]["batch_mode_avg_peak_mb"] = sum(peak_values) / len(peak_values)
+ self.results_summary["server_memory_metrics"]["batch_mode_max_peak_mb"] = max(peak_values)
+
+
+ self.results_summary.update({
+ "successful_requests": total_successful_requests,
+ "failed_requests": total_failed_requests,
+ "successful_urls": total_successful_urls,
+ "failed_urls": total_failed_urls,
+ "total_urls_processed": total_urls_processed,
+ })
+
+ except Exception as e:
+ console.print(f"[bold red]An error occurred during task execution: {e}[/bold red]")
+ import traceback
+ traceback.print_exc()
+ # No finally block needed for monitor task
+
+ end_time = time.time()
+ self.results_summary.update({
+ "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+ "total_time_seconds": end_time - start_time,
+ # No client memory report
+ })
+ self._save_results()
+ return self.results_summary
+
+ async def _make_api_request(
+ self,
+ chunk: List[str],
+ batch_idx: int,
+ total_batches: int,
+ semaphore: asyncio.Semaphore
+ # No memory tracker
+ ) -> Dict:
+ """Makes a single API request for a chunk of URLs, handling concurrency and logging server memory."""
+ request_success = False
+ success_urls = 0
+ failed_urls = 0
+ status = "Pending"
+ status_color = "grey"
+ server_memory_metric = None # Store delta (batch) or max snapshot (stream)
+ api_call_start_time = time.time()
+
+ async with semaphore:
+ try:
+ # No client memory sampling
+
+ endpoint = "/crawl/stream" if self.stream_mode else "/crawl"
+ payload = {
+ "urls": chunk,
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "stream": self.stream_mode}
+ }
+ }
+
+ if self.stream_mode:
+ max_server_mem_snapshot = 0.0 # Track max memory seen in this stream
+ async with self.http_client.stream("POST", endpoint, json=payload) as response:
+ initial_status_code = response.status_code
+ response.raise_for_status()
+
+ completed_marker_received = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed_marker_received = True
+ break
+ elif data.get("url"):
+ if data.get("success"): success_urls += 1
+ else: failed_urls += 1
+ # Extract server memory snapshot per result
+ mem_snapshot = data.get('server_memory_mb')
+ if mem_snapshot is not None:
+ max_server_mem_snapshot = max(max_server_mem_snapshot, float(mem_snapshot))
+ except json.JSONDecodeError:
+ console.print(f"[Batch {batch_idx}] [red]Stream decode error for line:[/red] {line}")
+ failed_urls = len(chunk)
+ break
+ request_success = completed_marker_received
+ if not request_success:
+ failed_urls = len(chunk) - success_urls
+ server_memory_metric = max_server_mem_snapshot # Use max snapshot for stream logging
+
+ else: # Batch mode
+ response = await self.http_client.post(endpoint, json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ # Extract server memory delta from the response
+ server_memory_metric = data.get('server_memory_delta_mb')
+ server_peak_mem_mb = data.get('server_peak_memory_mb')
+
+ if data.get("success") and "results" in data:
+ request_success = True
+ results_list = data.get("results", [])
+ for result_item in results_list:
+ if result_item.get("success"): success_urls += 1
+ else: failed_urls += 1
+ if len(results_list) != len(chunk):
+ console.print(f"[Batch {batch_idx}] [yellow]Warning: Result count ({len(results_list)}) doesn't match URL count ({len(chunk)})[/yellow]")
+ failed_urls = len(chunk) - success_urls
+ else:
+ request_success = False
+ failed_urls = len(chunk)
+ # Try to get memory from error detail if available
+ detail = data.get('detail')
+ if isinstance(detail, str):
+ try: detail_json = json.loads(detail)
+ except: detail_json = {}
+ elif isinstance(detail, dict):
+ detail_json = detail
+ else: detail_json = {}
+ server_peak_mem_mb = detail_json.get('server_peak_memory_mb', None)
+ server_memory_metric = detail_json.get('server_memory_delta_mb', None)
+ console.print(f"[Batch {batch_idx}] [red]API request failed:[/red] {detail_json.get('error', 'No details')}")
+
+
+ except httpx.HTTPStatusError as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]HTTP Error {e.response.status_code}:[/] {e.request.url}")
+ try:
+ error_detail = e.response.json()
+ # Attempt to extract memory info even from error responses
+ detail_content = error_detail.get('detail', {})
+ if isinstance(detail_content, str): # Handle if detail is stringified JSON
+ try: detail_content = json.loads(detail_content)
+ except: detail_content = {}
+ server_memory_metric = detail_content.get('server_memory_delta_mb', None)
+ server_peak_mem_mb = detail_content.get('server_peak_memory_mb', None)
+ console.print(f"Response: {error_detail}")
+ except Exception:
+ console.print(f"Response Text: {e.response.text[:200]}...")
+ except httpx.RequestError as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]Request Error:[/bold] {e.request.url} - {e}")
+ except Exception as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]Unexpected Error:[/bold] {e}")
+ import traceback
+ traceback.print_exc()
+
+ finally:
+ api_call_time = time.time() - api_call_start_time
+ total_processed_urls = success_urls + failed_urls
+
+ if request_success and failed_urls == 0: status_color, status = "green", "Success"
+ elif request_success and success_urls > 0: status_color, status = "yellow", "Partial"
+ else: status_color, status = "red", "Failed"
+
+ current_total_urls = batch_idx * self.chunk_size
+ progress_pct = min(100.0, (current_total_urls / self.url_count) * 100)
+ reqs_per_sec = 1.0 / api_call_time if api_call_time > 0 else float('inf')
+
+ # --- New Memory Formatting ---
+ mem_display = " N/A " # Default
+ peak_mem_value = None
+ delta_or_max_value = None
+
+ if self.stream_mode:
+ # server_memory_metric holds max snapshot for stream
+ if server_memory_metric is not None:
+ mem_display = f"{server_memory_metric:.1f} (Max)"
+ delta_or_max_value = server_memory_metric # Store for aggregation
+ else: # Batch mode - expect peak and delta
+ # We need to get peak and delta from the API response
+ peak_mem_value = locals().get('server_peak_mem_mb', None) # Get from response data if available
+ delta_value = server_memory_metric # server_memory_metric holds delta for batch
+
+ if peak_mem_value is not None and delta_value is not None:
+ mem_display = f"{peak_mem_value:.1f} / {delta_value:+.1f}"
+ delta_or_max_value = delta_value # Store delta for aggregation
+ elif peak_mem_value is not None:
+ mem_display = f"{peak_mem_value:.1f} / N/A"
+ elif delta_value is not None:
+ mem_display = f"N/A / {delta_value:+.1f}"
+ delta_or_max_value = delta_value # Store delta for aggregation
+
+ # --- Updated Print Statement with Adjusted Padding ---
+ console.print(
+ f" {batch_idx:<5} | {progress_pct:6.1f}% | {mem_display:>24} | {reqs_per_sec:8.1f} | " # Increased width for memory column
+ f"{success_urls:^7}/{failed_urls:<6} | {api_call_time:8.2f} | [{status_color}]{status:<7}[/{status_color}] " # Added trailing space
+ )
+
+ # --- Updated Return Dictionary ---
+ return_data = {
+ "batch_idx": batch_idx,
+ "request_success": request_success,
+ "success_urls": success_urls,
+ "failed_urls": failed_urls,
+ "time": api_call_time,
+ # Return both peak (if available) and delta/max
+ "server_peak_memory_mb": peak_mem_value, # Will be None for stream mode
+ "server_delta_or_max_mb": delta_or_max_value # Delta for batch, Max for stream
+ }
+ # Add back the specific batch mode delta if needed elsewhere, but delta_or_max covers it
+ # if not self.stream_mode:
+ # return_data["server_memory_delta_mb"] = delta_value
+ return return_data
+
+ # No _periodic_memory_sample needed
+
+ def _save_results(self) -> None:
+ """Saves the results summary to a JSON file."""
+ results_path = self.report_path / f"api_test_summary_{self.test_id}.json"
+ try:
+ # No client memory path to convert
+ with open(results_path, 'w', encoding='utf-8') as f:
+ json.dump(self.results_summary, f, indent=2, default=str)
+ except Exception as e:
+ console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
+
+
+# --- run_full_test Function ---
+async def run_full_test(args):
+ """Runs the full API stress test process."""
+ client = httpx.AsyncClient(base_url=args.api_url, timeout=REQUEST_TIMEOUT)
+
+ if not await check_server_health(client):
+ console.print("[bold red]Aborting test due to server health check failure.[/]")
+ await client.aclose()
+ return
+ await client.aclose()
+
+ test = ApiStressTest(
+ api_url=args.api_url,
+ url_count=args.urls,
+ max_concurrent_requests=args.max_concurrent_requests,
+ chunk_size=args.chunk_size,
+ report_path=args.report_path,
+ stream_mode=args.stream,
+ )
+ results = {}
+ try:
+ results = await test.run()
+ finally:
+ await test.close_client()
+
+ if not results:
+ console.print("[bold red]Test did not produce results.[/bold red]")
+ return
+
+ console.print("\n" + "=" * 80)
+ console.print("[bold green]API Stress Test Completed[/bold green]")
+ console.print("=" * 80)
+
+ success_rate_reqs = results["successful_requests"] / results["total_api_calls"] * 100 if results["total_api_calls"] > 0 else 0
+ success_rate_urls = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
+ urls_per_second = results["total_urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+ reqs_per_second = results["total_api_calls"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+
+
+ console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
+ console.print(f"[bold cyan]Target API:[/bold cyan] {results['api_url']}")
+ console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_concurrent_requests']} concurrent client requests, URLs/Req: {results['chunk_size']}, Stream: {results['stream_mode']}")
+ console.print(f"[bold cyan]API Requests:[/bold cyan] {results['successful_requests']} successful, {results['failed_requests']} failed ({results['total_api_calls']} total, {success_rate_reqs:.1f}% success)")
+ console.print(f"[bold cyan]URL Processing:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['total_urls_processed']} processed, {success_rate_urls:.1f}% success)")
+ console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f}s total | Avg Reqs/sec: {reqs_per_second:.2f} | Avg URLs/sec: {urls_per_second:.2f}")
+
+ # Report Server Memory
+ mem_metrics = results.get("server_memory_metrics", {})
+ mem_samples = mem_metrics.get("samples", [])
+ if mem_samples:
+ num_samples = len(mem_samples)
+ if results['stream_mode']:
+ avg_mem = mem_metrics.get("stream_mode_avg_max_snapshot_mb")
+ max_mem = mem_metrics.get("stream_mode_max_max_snapshot_mb")
+ avg_str = f"{avg_mem:.1f}" if avg_mem is not None else "N/A"
+ max_str = f"{max_mem:.1f}" if max_mem is not None else "N/A"
+ console.print(f"[bold cyan]Server Memory (Stream):[/bold cyan] Avg Max Snapshot: {avg_str} MB | Max Max Snapshot: {max_str} MB (across {num_samples} requests)")
+ else: # Batch mode
+ avg_delta = mem_metrics.get("batch_mode_avg_delta_mb")
+ max_delta = mem_metrics.get("batch_mode_max_delta_mb")
+ avg_peak = mem_metrics.get("batch_mode_avg_peak_mb")
+ max_peak = mem_metrics.get("batch_mode_max_peak_mb")
+
+ avg_delta_str = f"{avg_delta:.1f}" if avg_delta is not None else "N/A"
+ max_delta_str = f"{max_delta:.1f}" if max_delta is not None else "N/A"
+ avg_peak_str = f"{avg_peak:.1f}" if avg_peak is not None else "N/A"
+ max_peak_str = f"{max_peak:.1f}" if max_peak is not None else "N/A"
+
+ console.print(f"[bold cyan]Server Memory (Batch):[/bold cyan] Avg Peak: {avg_peak_str} MB | Max Peak: {max_peak_str} MB | Avg Delta: {avg_delta_str} MB | Max Delta: {max_delta_str} MB (across {num_samples} requests)")
+ else:
+ console.print("[bold cyan]Server Memory:[/bold cyan] No memory data reported by server.")
+
+
+ # No client memory report
+ summary_path = pathlib.Path(args.report_path) / f"api_test_summary_{results['test_id']}.json"
+ console.print(f"[bold green]Results summary saved to {summary_path}[/bold green]")
+
+ if results["failed_requests"] > 0:
+ console.print(f"\n[bold yellow]Warning: {results['failed_requests']} API requests failed ({100-success_rate_reqs:.1f}% failure rate)[/bold yellow]")
+ if results["failed_urls"] > 0:
+ console.print(f"[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate_urls:.1f}% URL failure rate)[/bold yellow]")
+ if results["total_urls_processed"] < results["url_count"]:
+ console.print(f"\n[bold red]Error: Only {results['total_urls_processed']} out of {results['url_count']} target URLs were processed![/bold red]")
+
+
+# --- main Function (Argument parsing mostly unchanged) ---
+def main():
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(description="Crawl4AI API Server Stress Test")
+
+ parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL, help=f"Base URL of the Crawl4AI API server (default: {DEFAULT_API_URL})")
+ parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Total number of unique URLs to process via API calls (default: {DEFAULT_URL_COUNT})")
+ parser.add_argument("--max-concurrent-requests", type=int, default=DEFAULT_MAX_CONCURRENT_REQUESTS, help=f"Maximum concurrent API requests from this client (default: {DEFAULT_MAX_CONCURRENT_REQUESTS})")
+ parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per API request payload (default: {DEFAULT_CHUNK_SIZE})")
+ parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Use the /crawl/stream endpoint instead of /crawl (default: {DEFAULT_STREAM_MODE})")
+ parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
+ parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
+
+ args = parser.parse_args()
+
+ console.print("[bold underline]Crawl4AI API Stress Test Configuration[/bold underline]")
+ console.print(f"API URL: {args.api_url}")
+ console.print(f"Total URLs: {args.urls}, Concurrent Client Requests: {args.max_concurrent_requests}, URLs per Request: {args.chunk_size}")
+ console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}")
+ console.print(f"Report Path: {args.report_path}")
+ console.print("-" * 40)
+ if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
+ console.print("-" * 40)
+
+ if args.clean_reports:
+ report_dir = pathlib.Path(args.report_path)
+ if report_dir.exists():
+ console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]")
+ shutil.rmtree(args.report_path)
+ report_dir.mkdir(parents=True, exist_ok=True)
+
+ try:
+ asyncio.run(run_full_test(args))
+ except KeyboardInterrupt:
+ console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
+ except Exception as e:
+ console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}")
+ import traceback
+ traceback.print_exc()
+
+if __name__ == "__main__":
+ # No need to modify sys.path for SimpleMemoryTracker as it's removed
+ main()
\ No newline at end of file
diff --git a/tests/memory/test_stress_api_xs.py b/tests/memory/test_stress_api_xs.py
new file mode 100644
index 00000000..27248883
--- /dev/null
+++ b/tests/memory/test_stress_api_xs.py
@@ -0,0 +1,203 @@
+"""Lite Crawl4AI API stress‑tester.
+
+✔ batch or stream mode (single unified path)
+✔ global stats + JSON summary
+✔ rich table progress
+✔ Typer CLI with presets (quick / debug / soak)
+
+Usage examples:
+    python test_stress_api_xs.py                # uses quick preset
+    python test_stress_api_xs.py soak           # 5 K URLs stress run
+    python test_stress_api_xs.py --urls 200 --concurrent 10 --chunk 20
+"""
+
+from __future__ import annotations
+
+import asyncio, json, time, uuid, pathlib, statistics
+from typing import List, Dict, Optional
+
+import httpx, typer
+from rich.console import Console
+from rich.table import Table
+
+# ───────────────────────── defaults / presets ──────────────────────────
+PRESETS = {
+ "quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
+ "debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
+ "soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
+}
+
+API_HEALTH_ENDPOINT = "/health"
+REQUEST_TIMEOUT = 180.0
+
+console = Console()
+app = typer.Typer(add_completion=False, rich_markup_mode="rich")
+
+# ───────────────────────── helpers ─────────────────────────────────────
+async def _check_health(client: httpx.AsyncClient) -> None:
+ resp = await client.get(API_HEALTH_ENDPOINT, timeout=10)
+ resp.raise_for_status()
+ console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]")
+
+async def _iter_results(resp: httpx.Response, stream: bool):
+ """Yield result dicts from batch JSON or ND‑JSON stream."""
+ if stream:
+ async for line in resp.aiter_lines():
+ if not line:
+ continue
+ rec = json.loads(line)
+ if rec.get("status") == "completed":
+ break
+ yield rec
+ else:
+ data = resp.json()
+ for rec in data.get("results", []):
+ yield rec, data # rec + whole payload for memory delta/peak
+
+async def _consume_stream(resp: httpx.Response) -> Dict:
+ stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0}
+ async for line in resp.aiter_lines():
+ if not line:
+ continue
+ rec = json.loads(line)
+ if rec.get("status") == "completed":
+ break
+ if rec.get("success"):
+ stats["success_urls"] += 1
+ else:
+ stats["failed_urls"] += 1
+ mem = rec.get("server_memory_mb")
+ if mem is not None:
+ stats["mem_metric"] = max(stats["mem_metric"], float(mem))
+ return stats
+
+def _consume_batch(body: Dict) -> Dict:
+ stats = {"success_urls": 0, "failed_urls": 0}
+ for rec in body.get("results", []):
+ if rec.get("success"):
+ stats["success_urls"] += 1
+ else:
+ stats["failed_urls"] += 1
+ stats["mem_metric"] = body.get("server_memory_delta_mb")
+ stats["peak"] = body.get("server_peak_memory_mb")
+ return stats
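+# Response shapes assumed by the consumers above (inferred only from the fields
+# they read, not from a formal API schema):
+#   stream (ND-JSON, one object per line):
+#     {"success": true, "server_memory_mb": 123.4, ...}   terminated by {"status": "completed"}
+#   batch (single JSON body):
+#     {"results": [{"success": true, ...}, ...],
+#      "server_memory_delta_mb": 12.3, "server_peak_memory_mb": 456.7}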
+
+async def _fetch_chunk(
+ client: httpx.AsyncClient,
+ urls: List[str],
+ stream: bool,
+ semaphore: asyncio.Semaphore,
+) -> Dict:
+ endpoint = "/crawl/stream" if stream else "/crawl"
+ payload = {
+ "urls": urls,
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "stream": stream}},
+ }
+
+ async with semaphore:
+ start = time.perf_counter()
+
+ if stream:
+ # ---- streaming request ----
+ async with client.stream("POST", endpoint, json=payload) as resp:
+ resp.raise_for_status()
+ stats = await _consume_stream(resp)
+ else:
+ # ---- batch request ----
+ resp = await client.post(endpoint, json=payload)
+ resp.raise_for_status()
+ stats = _consume_batch(resp.json())
+
+ stats["elapsed"] = time.perf_counter() - start
+ return stats
+
+
+# ───────────────────────── core runner ─────────────────────────────────
+async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
+ client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
+ await _check_health(client)
+
+ url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
+ chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
+ sem = asyncio.Semaphore(concurrent)
+
+ table = Table(show_header=True, header_style="bold magenta")
+ table.add_column("Batch", style="dim", width=6)
+ table.add_column("Success/Fail", width=12)
+ table.add_column("Mem", width=14)
+ table.add_column("Time (s)")
+
+ agg_success = agg_fail = 0
+ deltas, peaks = [], []
+
+ start = time.perf_counter()
+ tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
+ for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
+ res = await coro
+ agg_success += res["success_urls"]
+ agg_fail += res["failed_urls"]
+ if res["mem_metric"] is not None:
+ deltas.append(res["mem_metric"])
+ if res["peak"] is not None:
+ peaks.append(res["peak"])
+
+ mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else "‑"
+ if res["peak"] is not None:
+ mem_txt = f"{res['peak']:.1f}/{mem_txt}"
+
+ table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
+
+ console.print(table)
+ total_time = time.perf_counter() - start
+
+ summary = {
+ "urls": urls,
+ "concurrent": concurrent,
+ "chunk": chunk,
+ "stream": stream,
+ "success_urls": agg_success,
+ "failed_urls": agg_fail,
+ "elapsed_sec": round(total_time, 2),
+ "avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
+ "max_mem": max(deltas) if deltas else None,
+ "avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
+ "max_peak": max(peaks) if peaks else None,
+ }
+    console.print("\n[bold green]Done:[/]", summary)
+
+ report.mkdir(parents=True, exist_ok=True)
+ path = report / f"api_test_{int(time.time())}.json"
+ path.write_text(json.dumps(summary, indent=2))
+ console.print(f"[green]Summary → {path}")
+
+ await client.aclose()
+
+# ───────────────────────── Typer CLI ──────────────────────────────────
+@app.command()
+def main(
+ preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
+ api_url: str = typer.Option("http://localhost:8020", show_default=True),
+ urls: int = typer.Option(None, help="Total URLs to crawl"),
+ concurrent: int = typer.Option(None, help="Concurrent API requests"),
+ chunk: int = typer.Option(None, help="URLs per request"),
+ stream: bool = typer.Option(None, help="Use /crawl/stream"),
+ report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
+):
+ """Run a stress test against a running Crawl4AI API server."""
+ if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)):
+ console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
+ raise typer.Exit(1)
+
+ cfg = PRESETS.get(preset, {})
+ urls = urls or cfg.get("urls")
+ concurrent = concurrent or cfg.get("concurrent")
+ chunk = chunk or cfg.get("chunk")
+ stream = stream if stream is not None else cfg.get("stream", False)
+
+ console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
+ asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
+
+if __name__ == "__main__":
+ app()
diff --git a/tests/memory/test_stress_docker_api.py b/tests/memory/test_stress_docker_api.py
new file mode 100644
index 00000000..05b3bea8
--- /dev/null
+++ b/tests/memory/test_stress_docker_api.py
@@ -0,0 +1,129 @@
+"""
+Crawl4AI Docker API stress tester.
+
+Examples
+--------
+python test_stress_docker_api.py --urls 1000 --concurrency 32
+python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream
+python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2
+"""
+
+import argparse, asyncio, json, secrets, statistics, time
+from typing import List, Tuple
+import httpx
+from rich.console import Console
+from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
+from rich.table import Table
+
+console = Console()
+
+
+# ───────────────────────── helpers ─────────────────────────
+def make_fake_urls(n: int) -> List[str]:
+ base = "https://httpbin.org/anything/"
+ return [f"{base}{secrets.token_hex(8)}" for _ in range(n)]
+
+
+async def fire(
+ client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore
+) -> Tuple[bool, float]:
+ async with sem:
+ print(f"POST {endpoint} with {len(payload['urls'])} URLs")
+ t0 = time.perf_counter()
+ try:
+ if endpoint.endswith("/stream"):
+ async with client.stream("POST", endpoint, json=payload) as r:
+ r.raise_for_status()
+ async for _ in r.aiter_lines():
+ pass
+ else:
+ r = await client.post(endpoint, json=payload)
+ r.raise_for_status()
+ return True, time.perf_counter() - t0
+ except Exception:
+ return False, time.perf_counter() - t0
+
+
+def pct(lat: List[float], p: float) -> str:
+ """Return percentile string even for tiny samples."""
+ if not lat:
+ return "-"
+ if len(lat) == 1:
+ return f"{lat[0]:.2f}s"
+ lat_sorted = sorted(lat)
+ k = (p / 100) * (len(lat_sorted) - 1)
+ lo = int(k)
+ hi = min(lo + 1, len(lat_sorted) - 1)
+ frac = k - lo
+ val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac
+ return f"{val:.2f}s"
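+# Worked example: pct([1.0, 2.0, 3.0, 4.0], 95) interpolates between the 3rd and
+# 4th samples (k = 0.95 * 3 = 2.85) and returns "3.85s"; pct([], 50) returns "-".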
+
+
+# ───────────────────────── main ─────────────────────────
+def parse_args() -> argparse.Namespace:
+ p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API")
+ p.add_argument("--urls", type=int, default=100, help="number of URLs")
+ p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight")
+ p.add_argument("--chunk-size", type=int, default=50, help="URLs per request")
+ p.add_argument("--base-url", default="http://localhost:11235", help="API root")
+ # p.add_argument("--base-url", default="http://localhost:8020", help="API root")
+ p.add_argument("--stream", action="store_true", help="use /crawl/stream")
+ p.add_argument("--http2", action="store_true", help="enable HTTP/2")
+    p.add_argument("--headless", action=argparse.BooleanOptionalAction, default=True, help="run browsers headless (pass --no-headless for a visible window)")
+ return p.parse_args()
+
+
+async def main() -> None:
+ args = parse_args()
+
+ urls = make_fake_urls(args.urls)
+ batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)]
+ endpoint = "/crawl/stream" if args.stream else "/crawl"
+ sem = asyncio.Semaphore(args.concurrency)
+
+ async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client:
+ with Progress(
+ "[progress.description]{task.description}",
+ BarColumn(),
+ "[progress.percentage]{task.percentage:>3.0f}%",
+ TimeElapsedColumn(),
+ TimeRemainingColumn(),
+ ) as progress:
+ task_id = progress.add_task("[cyan]bombarding…", total=len(batches))
+            tasks = []
+            for chunk in batches:
+                payload = {
+                    "urls": chunk,
+                    "browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}},
+                    "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}},
+                }
+                tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem)))
+
+            # Advance the bar as each request finishes rather than when it is scheduled
+            results = []
+            for coro in asyncio.as_completed(tasks):
+                results.append(await coro)
+                progress.advance(task_id)
+
+ ok_latencies = [dt for ok, dt in results if ok]
+ err_count = sum(1 for ok, _ in results if not ok)
+
+ table = Table(title="Docker API Stress‑Test Summary")
+ table.add_column("total", justify="right")
+ table.add_column("errors", justify="right")
+ table.add_column("p50", justify="right")
+ table.add_column("p95", justify="right")
+ table.add_column("max", justify="right")
+
+ table.add_row(
+ str(len(results)),
+ str(err_count),
+ pct(ok_latencies, 50),
+ pct(ok_latencies, 95),
+ f"{max(ok_latencies):.2f}s" if ok_latencies else "-",
+ )
+ console.print(table)
+
+
+if __name__ == "__main__":
+ try:
+ asyncio.run(main())
+ except KeyboardInterrupt:
+ console.print("\n[yellow]aborted by user[/]")
diff --git a/tests/memory/test_stress_sdk.py b/tests/memory/test_stress_sdk.py
new file mode 100644
index 00000000..14da94a4
--- /dev/null
+++ b/tests/memory/test_stress_sdk.py
@@ -0,0 +1,500 @@
+#!/usr/bin/env python3
+"""
+Stress test for Crawl4AI's arun_many and dispatcher system.
+This version uses a local HTTP server and focuses on testing
+the SDK's ability to handle multiple URLs concurrently, with per-batch logging.
+"""
+
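+# Typical invocations (all flags are defined in main() at the bottom of this file):
+#   python test_stress_sdk.py --urls 100 --max-sessions 8 --chunk-size 20
+#   python test_stress_sdk.py --urls 50 --stream --use-rate-limiter
+#   python test_stress_sdk.py --skip-generation --site-path test_site --keep-site
+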
+import asyncio
+import os
+import time
+import pathlib
+import random
+import secrets
+import argparse
+import json
+import sys
+import subprocess
+import signal
+from typing import List, Dict, Optional, Union, AsyncGenerator
+import shutil
+from rich.console import Console
+
+# Crawl4AI components
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BrowserConfig,
+ MemoryAdaptiveDispatcher,
+ CrawlerMonitor,
+ DisplayMode,
+ CrawlResult,
+ RateLimiter,
+ CacheMode,
+)
+
+# Constants
+DEFAULT_SITE_PATH = "test_site"
+DEFAULT_PORT = 8000
+DEFAULT_MAX_SESSIONS = 16
+DEFAULT_URL_COUNT = 1
+DEFAULT_CHUNK_SIZE = 1 # Define chunk size for batch logging
+DEFAULT_REPORT_PATH = "reports"
+DEFAULT_STREAM_MODE = False
+DEFAULT_MONITOR_MODE = "DETAILED"
+
+# Initialize Rich console
+console = Console()
+
+# --- SiteGenerator Class (Unchanged) ---
+class SiteGenerator:
+ """Generates a local test site with heavy pages for stress testing."""
+
+ def __init__(self, site_path: str = DEFAULT_SITE_PATH, page_count: int = DEFAULT_URL_COUNT):
+ self.site_path = pathlib.Path(site_path)
+ self.page_count = page_count
+ self.images_dir = self.site_path / "images"
+        self.lorem_words = ("lorem ipsum dolor sit amet " * 100).split()
+
+        self.html_template = """<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page {page_num}</title>
+    <meta charset="utf-8">
+</head>
+<body>
+    <h1>Test Page {page_num}</h1>
+    {paragraphs}
+    {images}
+</body>
+</html>
+"""
+
+ def generate_site(self) -> None:
+ self.site_path.mkdir(parents=True, exist_ok=True)
+ self.images_dir.mkdir(exist_ok=True)
+ console.print(f"Generating {self.page_count} test pages...")
+ for i in range(self.page_count):
+            paragraphs = "\n".join(f"<p>{' '.join(random.choices(self.lorem_words, k=200))}</p>" for _ in range(5))
+            # image tags reference files under images/ (no files are generated; requests simply 404)
+            images = "\n".join(f'<img src="images/img_{j}.jpg" alt="image {j}">' for j in range(3))
+ page_path = self.site_path / f"page_{i}.html"
+ page_path.write_text(self.html_template.format(page_num=i, paragraphs=paragraphs, images=images), encoding="utf-8")
+ if (i + 1) % (self.page_count // 10 or 1) == 0 or i == self.page_count - 1:
+ console.print(f"Generated {i+1}/{self.page_count} pages")
+ self._create_index_page()
+ console.print(f"[bold green]Successfully generated {self.page_count} test pages in [cyan]{self.site_path}[/cyan][/bold green]")
+
+ def _create_index_page(self) -> None:
+        index_content = """<html><head><title>Test Site Index</title></head><body><h1>Test Site Index</h1><p>This is an automatically generated site for testing Crawl4AI.</p>\n"""
+        for i in range(self.page_count):
+            index_content += f'<a href="page_{i}.html">Test Page {i}</a><br>\n'
+        index_content += """</body></html>"""
+ (self.site_path / "index.html").write_text(index_content, encoding="utf-8")
+
+# --- LocalHttpServer Class (Unchanged) ---
+class LocalHttpServer:
+ """Manages a local HTTP server for serving test pages."""
+ def __init__(self, site_path: str = DEFAULT_SITE_PATH, port: int = DEFAULT_PORT):
+ self.site_path = pathlib.Path(site_path)
+ self.port = port
+ self.process = None
+
+ def start(self) -> None:
+ if not self.site_path.exists(): raise FileNotFoundError(f"Site directory {self.site_path} does not exist")
+ console.print(f"Attempting to start HTTP server in [cyan]{self.site_path}[/cyan] on port {self.port}...")
+ try:
+ cmd = ["python", "-m", "http.server", str(self.port)]
+ creationflags = 0; preexec_fn = None
+ if sys.platform == 'win32': creationflags = subprocess.CREATE_NEW_PROCESS_GROUP
+ self.process = subprocess.Popen(cmd, cwd=str(self.site_path), stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=creationflags)
+ time.sleep(1.5)
+ if self.is_running(): console.print(f"[bold green]HTTP server started successfully (PID: {self.process.pid})[/bold green]")
+ else:
+ console.print("[bold red]Failed to start HTTP server. Checking logs...[/bold red]")
+ stdout, stderr = self.process.communicate(); print(stdout.decode(errors='ignore')); print(stderr.decode(errors='ignore'))
+ self.stop(); raise RuntimeError("HTTP server failed to start.")
+ except Exception as e: console.print(f"[bold red]Error starting HTTP server: {str(e)}[/bold red]"); self.stop(); raise
+
+ def stop(self) -> None:
+ if self.process and self.is_running():
+ console.print(f"Stopping HTTP server (PID: {self.process.pid})...")
+ try:
+ if sys.platform == 'win32': self.process.send_signal(signal.CTRL_BREAK_EVENT); time.sleep(0.5)
+ self.process.terminate()
+ try: stdout, stderr = self.process.communicate(timeout=5); console.print("[bold yellow]HTTP server stopped[/bold yellow]")
+ except subprocess.TimeoutExpired: console.print("[bold red]Server did not terminate gracefully, killing...[/bold red]"); self.process.kill(); stdout, stderr = self.process.communicate(); console.print("[bold yellow]HTTP server killed[/bold yellow]")
+ except Exception as e: console.print(f"[bold red]Error stopping HTTP server: {str(e)}[/bold red]"); self.process.kill()
+ finally: self.process = None
+ elif self.process: console.print("[dim]HTTP server process already stopped.[/dim]"); self.process = None
+
+ def is_running(self) -> bool:
+ if not self.process: return False
+ return self.process.poll() is None
+
+# --- SimpleMemoryTracker Class (Unchanged) ---
+class SimpleMemoryTracker:
+ """Basic memory tracker that doesn't rely on psutil."""
+ def __init__(self, report_path: str = DEFAULT_REPORT_PATH, test_id: Optional[str] = None):
+ self.report_path = pathlib.Path(report_path); self.report_path.mkdir(parents=True, exist_ok=True)
+ self.test_id = test_id or time.strftime("%Y%m%d_%H%M%S")
+ self.start_time = time.time(); self.memory_samples = []; self.pid = os.getpid()
+ self.csv_path = self.report_path / f"memory_samples_{self.test_id}.csv"
+ with open(self.csv_path, 'w', encoding='utf-8') as f: f.write("timestamp,elapsed_seconds,memory_info_mb\n")
+
+ def sample(self) -> Dict:
+ try:
+ memory_mb = self._get_memory_info_mb()
+ memory_str = f"{memory_mb:.1f} MB" if memory_mb is not None else "Unknown"
+ timestamp = time.time(); elapsed = timestamp - self.start_time
+ sample = {"timestamp": timestamp, "elapsed_seconds": elapsed, "memory_mb": memory_mb, "memory_str": memory_str}
+ self.memory_samples.append(sample)
+ with open(self.csv_path, 'a', encoding='utf-8') as f: f.write(f"{timestamp},{elapsed:.2f},{memory_mb if memory_mb is not None else ''}\n")
+ return sample
+ except Exception as e: return {"memory_mb": None, "memory_str": "Error"}
+
+ def _get_memory_info_mb(self) -> Optional[float]:
+ pid_str = str(self.pid)
+ try:
+ if sys.platform == 'darwin': result = subprocess.run(["ps", "-o", "rss=", "-p", pid_str], capture_output=True, text=True, check=True, encoding='utf-8'); return int(result.stdout.strip()) / 1024.0
+ elif sys.platform == 'linux':
+ with open(f"/proc/{pid_str}/status", encoding='utf-8') as f:
+ for line in f:
+ if line.startswith("VmRSS:"): return int(line.split()[1]) / 1024.0
+ return None
+ elif sys.platform == 'win32': result = subprocess.run(["tasklist", "/fi", f"PID eq {pid_str}", "/fo", "csv", "/nh"], capture_output=True, text=True, check=True, encoding='cp850', errors='ignore'); parts = result.stdout.strip().split('","'); return int(parts[4].strip().replace('"', '').replace(' K', '').replace(',', '')) / 1024.0 if len(parts) >= 5 else None
+ else: return None
+        except Exception: return None # Swallow per-platform lookup failures for robustness
+
+ def get_report(self) -> Dict:
+ if not self.memory_samples: return {"error": "No memory samples collected"}
+ total_time = time.time() - self.start_time; valid_samples = [s['memory_mb'] for s in self.memory_samples if s['memory_mb'] is not None]
+ start_mem = valid_samples[0] if valid_samples else None; end_mem = valid_samples[-1] if valid_samples else None
+ max_mem = max(valid_samples) if valid_samples else None; avg_mem = sum(valid_samples) / len(valid_samples) if valid_samples else None
+ growth = (end_mem - start_mem) if start_mem is not None and end_mem is not None else None
+ return {"test_id": self.test_id, "total_time_seconds": total_time, "sample_count": len(self.memory_samples), "valid_sample_count": len(valid_samples), "csv_path": str(self.csv_path), "platform": sys.platform, "start_memory_mb": start_mem, "end_memory_mb": end_mem, "max_memory_mb": max_mem, "average_memory_mb": avg_mem, "memory_growth_mb": growth}
+
+
+# --- CrawlerStressTest Class (Refactored for Per-Batch Logging) ---
+class CrawlerStressTest:
+ """Orchestrates the stress test using arun_many per chunk and a dispatcher."""
+
+ def __init__(
+ self,
+ url_count: int = DEFAULT_URL_COUNT,
+ port: int = DEFAULT_PORT,
+ max_sessions: int = DEFAULT_MAX_SESSIONS,
+ chunk_size: int = DEFAULT_CHUNK_SIZE, # Added chunk_size
+ report_path: str = DEFAULT_REPORT_PATH,
+ stream_mode: bool = DEFAULT_STREAM_MODE,
+ monitor_mode: str = DEFAULT_MONITOR_MODE,
+ use_rate_limiter: bool = False
+ ):
+ self.url_count = url_count
+ self.server_port = port
+ self.max_sessions = max_sessions
+ self.chunk_size = chunk_size # Store chunk size
+ self.report_path = pathlib.Path(report_path)
+ self.report_path.mkdir(parents=True, exist_ok=True)
+ self.stream_mode = stream_mode
+ self.monitor_mode = DisplayMode[monitor_mode.upper()]
+ self.use_rate_limiter = use_rate_limiter
+
+ self.test_id = time.strftime("%Y%m%d_%H%M%S")
+ self.results_summary = {
+ "test_id": self.test_id, "url_count": url_count, "max_sessions": max_sessions,
+ "chunk_size": chunk_size, "stream_mode": stream_mode, "monitor_mode": monitor_mode,
+ "rate_limiter_used": use_rate_limiter, "start_time": "", "end_time": "",
+ "total_time_seconds": 0, "successful_urls": 0, "failed_urls": 0,
+ "urls_processed": 0, "chunks_processed": 0
+ }
+
+ async def run(self) -> Dict:
+ """Run the stress test and return results."""
+ memory_tracker = SimpleMemoryTracker(report_path=self.report_path, test_id=self.test_id)
+ urls = [f"http://localhost:{self.server_port}/page_{i}.html" for i in range(self.url_count)]
+ # Split URLs into chunks based on self.chunk_size
+ url_chunks = [urls[i:i+self.chunk_size] for i in range(0, len(urls), self.chunk_size)]
+
+ self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
+ start_time = time.time()
+
+ config = CrawlerRunConfig(
+ wait_for_images=False, verbose=False,
+ stream=self.stream_mode, # Still pass stream mode, affects arun_many return type
+ cache_mode=CacheMode.BYPASS
+ )
+
+ total_successful_urls = 0
+ total_failed_urls = 0
+ total_urls_processed = 0
+ start_memory_sample = memory_tracker.sample()
+ start_memory_str = start_memory_sample.get("memory_str", "Unknown")
+
+ # monitor = CrawlerMonitor(display_mode=self.monitor_mode, total_urls=self.url_count)
+ monitor = None
+ rate_limiter = RateLimiter(base_delay=(0.1, 0.3)) if self.use_rate_limiter else None
+ dispatcher = MemoryAdaptiveDispatcher(max_session_permit=self.max_sessions, monitor=monitor, rate_limiter=rate_limiter)
+
+ console.print(f"\n[bold cyan]Crawl4AI Stress Test - {self.url_count} URLs, {self.max_sessions} max sessions[/bold cyan]")
+ console.print(f"[bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]Monitor:[/bold cyan] {self.monitor_mode.name}, [bold cyan]Chunk Size:[/bold cyan] {self.chunk_size}")
+ console.print(f"[bold cyan]Initial Memory:[/bold cyan] {start_memory_str}")
+
+ # Print batch log header only if not streaming
+ if not self.stream_mode:
+ console.print("\n[bold]Batch Progress:[/bold] (Monitor below shows overall progress)")
+ console.print("[bold] Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status [/bold]")
+ console.print("─" * 90)
+
+ monitor_task = asyncio.create_task(self._periodic_memory_sample(memory_tracker, 2.0))
+
+ try:
+ async with AsyncWebCrawler(
+ config=BrowserConfig( verbose = False)
+ ) as crawler:
+ # Process URLs chunk by chunk
+ for chunk_idx, url_chunk in enumerate(url_chunks):
+ batch_start_time = time.time()
+ chunk_success = 0
+ chunk_failed = 0
+
+ # Sample memory before the chunk
+ start_mem_sample = memory_tracker.sample()
+ start_mem_str = start_mem_sample.get("memory_str", "Unknown")
+
+ # --- Call arun_many for the current chunk ---
+ try:
+ # Note: dispatcher/monitor persist across calls
+ results_gen_or_list: Union[AsyncGenerator[CrawlResult, None], List[CrawlResult]] = \
+ await crawler.arun_many(
+ urls=url_chunk,
+ config=config,
+ dispatcher=dispatcher # Reuse the same dispatcher
+ )
+
+ if self.stream_mode:
+ # Process stream results if needed, but batch logging is less relevant
+ async for result in results_gen_or_list:
+ total_urls_processed += 1
+ if result.success: chunk_success += 1
+ else: chunk_failed += 1
+ # In stream mode, batch summary isn't as meaningful here
+ # We could potentially track completion per chunk async, but it's complex
+
+ else: # Batch mode
+ # Process the list of results for this chunk
+ for result in results_gen_or_list:
+ total_urls_processed += 1
+ if result.success: chunk_success += 1
+ else: chunk_failed += 1
+
+ except Exception as e:
+ console.print(f"[bold red]Error processing chunk {chunk_idx+1}: {e}[/bold red]")
+ chunk_failed = len(url_chunk) # Assume all failed in the chunk on error
+ total_urls_processed += len(url_chunk) # Count them as processed (failed)
+
+ # --- Log batch results (only if not streaming) ---
+ if not self.stream_mode:
+ batch_time = time.time() - batch_start_time
+ urls_per_sec = len(url_chunk) / batch_time if batch_time > 0 else 0
+ end_mem_sample = memory_tracker.sample()
+ end_mem_str = end_mem_sample.get("memory_str", "Unknown")
+
+ progress_pct = (total_urls_processed / self.url_count) * 100
+
+ if chunk_failed == 0: status_color, status = "green", "Success"
+ elif chunk_success == 0: status_color, status = "red", "Failed"
+ else: status_color, status = "yellow", "Partial"
+
+ console.print(
+ f" {chunk_idx+1:<5} | {progress_pct:6.1f}% | {start_mem_str:>9} | {end_mem_str:>9} | {urls_per_sec:8.1f} | "
+ f"{chunk_success:^7}/{chunk_failed:<6} | {batch_time:8.2f} | [{status_color}]{status:<7}[/{status_color}]"
+ )
+
+ # Accumulate totals
+ total_successful_urls += chunk_success
+ total_failed_urls += chunk_failed
+ self.results_summary["chunks_processed"] += 1
+
+ # Optional small delay between starting chunks if needed
+ # await asyncio.sleep(0.1)
+
+ except Exception as e:
+ console.print(f"[bold red]An error occurred during the main crawl loop: {e}[/bold red]")
+ finally:
+ if 'monitor_task' in locals() and not monitor_task.done():
+ monitor_task.cancel()
+ try: await monitor_task
+ except asyncio.CancelledError: pass
+
+ end_time = time.time()
+ self.results_summary.update({
+ "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+ "total_time_seconds": end_time - start_time,
+ "successful_urls": total_successful_urls,
+ "failed_urls": total_failed_urls,
+ "urls_processed": total_urls_processed,
+ "memory": memory_tracker.get_report()
+ })
+ self._save_results()
+ return self.results_summary
+
+ async def _periodic_memory_sample(self, tracker: SimpleMemoryTracker, interval: float):
+ """Background task to sample memory periodically."""
+ while True:
+ tracker.sample()
+ try:
+ await asyncio.sleep(interval)
+ except asyncio.CancelledError:
+ break # Exit loop on cancellation
+
+ def _save_results(self) -> None:
+ results_path = self.report_path / f"test_summary_{self.test_id}.json"
+ try:
+ with open(results_path, 'w', encoding='utf-8') as f: json.dump(self.results_summary, f, indent=2, default=str)
+ # console.print(f"\n[bold green]Results summary saved to {results_path}[/bold green]") # Moved summary print to run_full_test
+ except Exception as e: console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
+
+
+# --- run_full_test Function (Adjusted) ---
+async def run_full_test(args):
+ """Run the complete test process from site generation to crawling."""
+ server = None
+ site_generated = False
+
+ # --- Site Generation --- (Same as before)
+ if not args.use_existing_site and not args.skip_generation:
+ if os.path.exists(args.site_path): console.print(f"[yellow]Removing existing site directory: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
+ site_generator = SiteGenerator(site_path=args.site_path, page_count=args.urls); site_generator.generate_site(); site_generated = True
+ elif args.use_existing_site: console.print(f"[cyan]Using existing site assumed to be running on port {args.port}[/cyan]")
+ elif args.skip_generation:
+ console.print(f"[cyan]Skipping site generation, using existing directory: {args.site_path}[/cyan]")
+ if not os.path.exists(args.site_path) or not os.path.isdir(args.site_path): console.print(f"[bold red]Error: Site path '{args.site_path}' does not exist or is not a directory.[/bold red]"); return
+
+ # --- Start Local Server --- (Same as before)
+ server_started = False
+ if not args.use_existing_site:
+ server = LocalHttpServer(site_path=args.site_path, port=args.port)
+ try: server.start(); server_started = True
+ except Exception as e:
+            console.print(f"[bold red]Failed to start local server: {e}. Aborting test.[/bold red]")
+ if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
+ return
+
+ try:
+ # --- Run the Stress Test ---
+ test = CrawlerStressTest(
+ url_count=args.urls,
+ port=args.port,
+ max_sessions=args.max_sessions,
+ chunk_size=args.chunk_size, # Pass chunk_size
+ report_path=args.report_path,
+ stream_mode=args.stream,
+ monitor_mode=args.monitor_mode,
+ use_rate_limiter=args.use_rate_limiter
+ )
+ results = await test.run() # Run the test which now handles chunks internally
+
+ # --- Print Summary ---
+ console.print("\n" + "=" * 80)
+ console.print("[bold green]Test Completed[/bold green]")
+ console.print("=" * 80)
+
+ # (Summary printing logic remains largely the same)
+ success_rate = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
+ urls_per_second = results["urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+
+ console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
+ console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_sessions']} sessions, Chunk: {results['chunk_size']}, Stream: {results['stream_mode']}, Monitor: {results['monitor_mode']}")
+ console.print(f"[bold cyan]Results:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['urls_processed']} processed, {success_rate:.1f}% success)")
+ console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f} seconds total, {urls_per_second:.2f} URLs/second avg")
+
+ mem_report = results.get("memory", {})
+ mem_info_str = "Memory tracking data unavailable."
+ if mem_report and not mem_report.get("error"):
+ start_mb = mem_report.get('start_memory_mb'); end_mb = mem_report.get('end_memory_mb'); max_mb = mem_report.get('max_memory_mb'); growth_mb = mem_report.get('memory_growth_mb')
+ mem_parts = []
+ if start_mb is not None: mem_parts.append(f"Start: {start_mb:.1f} MB")
+ if end_mb is not None: mem_parts.append(f"End: {end_mb:.1f} MB")
+ if max_mb is not None: mem_parts.append(f"Max: {max_mb:.1f} MB")
+ if growth_mb is not None: mem_parts.append(f"Growth: {growth_mb:.1f} MB")
+ if mem_parts: mem_info_str = ", ".join(mem_parts)
+ csv_path = mem_report.get('csv_path')
+ if csv_path: console.print(f"[dim]Memory samples saved to: {csv_path}[/dim]")
+
+ console.print(f"[bold cyan]Memory Usage:[/bold cyan] {mem_info_str}")
+ console.print(f"[bold green]Results summary saved to {results['memory']['csv_path'].replace('memory_samples', 'test_summary').replace('.csv', '.json')}[/bold green]") # Infer summary path
+
+
+ if results["failed_urls"] > 0: console.print(f"\n[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate:.1f}% failure rate)[/bold yellow]")
+ if results["urls_processed"] < results["url_count"]: console.print(f"\n[bold red]Error: Only {results['urls_processed']} out of {results['url_count']} URLs were processed![/bold red]")
+
+
+ finally:
+ # --- Stop Server / Cleanup --- (Same as before)
+ if server_started and server and not args.keep_server_alive: server.stop()
+ elif server_started and server and args.keep_server_alive:
+ console.print(f"[bold cyan]Server is kept running on port {args.port}. Press Ctrl+C to stop it.[/bold cyan]")
+ try: await asyncio.Future() # Keep running indefinitely
+ except KeyboardInterrupt: console.print("\n[bold yellow]Stopping server due to user interrupt...[/bold yellow]"); server.stop()
+
+ if site_generated and not args.keep_site: console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
+ elif args.clean_site and os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
+
+
+# --- main Function (Added chunk_size argument) ---
+def main():
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(description="Crawl4AI SDK High Volume Stress Test using arun_many")
+
+ # Test parameters
+ parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Number of URLs to test (default: {DEFAULT_URL_COUNT})")
+ parser.add_argument("--max-sessions", type=int, default=DEFAULT_MAX_SESSIONS, help=f"Maximum concurrent crawling sessions (default: {DEFAULT_MAX_SESSIONS})")
+ parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per batch for logging (default: {DEFAULT_CHUNK_SIZE})") # Added
+ parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Enable streaming mode (disables batch logging) (default: {DEFAULT_STREAM_MODE})")
+ parser.add_argument("--monitor-mode", type=str, default=DEFAULT_MONITOR_MODE, choices=["DETAILED", "AGGREGATED"], help=f"Display mode for the live monitor (default: {DEFAULT_MONITOR_MODE})")
+ parser.add_argument("--use-rate-limiter", action="store_true", default=False, help="Enable a basic rate limiter (default: False)")
+
+ # Environment parameters
+ parser.add_argument("--site-path", type=str, default=DEFAULT_SITE_PATH, help=f"Path to generate/use the test site (default: {DEFAULT_SITE_PATH})")
+ parser.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port for the local HTTP server (default: {DEFAULT_PORT})")
+ parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
+
+ # Site/Server management
+ parser.add_argument("--skip-generation", action="store_true", help="Use existing test site folder without regenerating")
+ parser.add_argument("--use-existing-site", action="store_true", help="Do not generate site or start local server; assume site exists on --port")
+ parser.add_argument("--keep-server-alive", action="store_true", help="Keep the local HTTP server running after test")
+ parser.add_argument("--keep-site", action="store_true", help="Keep the generated test site files after test")
+ parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
+ parser.add_argument("--clean-site", action="store_true", help="Clean up site directory before running (if generating) or after")
+
+ args = parser.parse_args()
+
+ # Display config
+ console.print("[bold underline]Crawl4AI SDK Stress Test Configuration[/bold underline]")
+ console.print(f"URLs: {args.urls}, Max Sessions: {args.max_sessions}, Chunk Size: {args.chunk_size}") # Added chunk size
+ console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}, Monitor: {args.monitor_mode}, Rate Limit: {args.use_rate_limiter}")
+ console.print(f"Site Path: {args.site_path}, Port: {args.port}, Report Path: {args.report_path}")
+ console.print("-" * 40)
+ # (Rest of config display and cleanup logic is the same)
+ if args.use_existing_site: console.print("[cyan]Mode: Using existing external site/server[/cyan]")
+ elif args.skip_generation: console.print("[cyan]Mode: Using existing site files, starting local server[/cyan]")
+ else: console.print("[cyan]Mode: Generating site files, starting local server[/cyan]")
+ if args.keep_server_alive: console.print("[cyan]Option: Keep server alive after test[/cyan]")
+ if args.keep_site: console.print("[cyan]Option: Keep site files after test[/cyan]")
+ if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
+ if args.clean_site: console.print("[cyan]Option: Clean site directory[/cyan]")
+ console.print("-" * 40)
+
+ if args.clean_reports:
+ if os.path.exists(args.report_path): console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]"); shutil.rmtree(args.report_path)
+ os.makedirs(args.report_path, exist_ok=True)
+ if args.clean_site and not args.use_existing_site:
+ if os.path.exists(args.site_path): console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]"); shutil.rmtree(args.site_path)
+
+ # Run
+ try: asyncio.run(run_full_test(args))
+ except KeyboardInterrupt: console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
+ except Exception as e: console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}"); import traceback; traceback.print_exc()
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/tests/profiler/test_crteate_profile.py b/tests/profiler/test_crteate_profile.py
new file mode 100644
index 00000000..e441ea4a
--- /dev/null
+++ b/tests/profiler/test_crteate_profile.py
@@ -0,0 +1,32 @@
+from crawl4ai import BrowserProfiler
+import asyncio
+
+
+if __name__ == "__main__":
+ # Example usage
+ profiler = BrowserProfiler()
+
+ # Create a new profile
+    from pathlib import Path
+    home_dir = Path.home()
+    profile_path = asyncio.run(profiler.create_profile(str(home_dir / ".crawl4ai/profiles/test-profile")))
+
+ print(f"Profile created at: {profile_path}")
+
+
+
+ # # Launch a standalone browser
+ # asyncio.run(profiler.launch_standalone_browser())
+
+ # # List profiles
+ # profiles = profiler.list_profiles()
+ # for profile in profiles:
+ # print(f"Profile: {profile['name']}, Path: {profile['path']}")
+
+ # # Delete a profile
+ # success = profiler.delete_profile("my-profile")
+ # if success:
+ # print("Profile deleted successfully")
+ # else:
+ # print("Failed to delete profile")
\ No newline at end of file
diff --git a/tests/test_scraping_strategy.py b/tests/test_scraping_strategy.py
index 425d02c9..df462854 100644
--- a/tests/test_scraping_strategy.py
+++ b/tests/test_scraping_strategy.py
@@ -19,7 +19,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
print(f"Success: {result.success}")
- print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+ print(f"Markdown length: {len(result.markdown.raw_markdown)}")
if __name__ == "__main__":
diff --git a/tests/test_web_crawler.py b/tests/test_web_crawler.py
index d6eddfdc..b8453192 100644
--- a/tests/test_web_crawler.py
+++ b/tests/test_web_crawler.py
@@ -1,4 +1,5 @@
import unittest, os
+from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import (
RegexChunking,
@@ -42,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
word_count_threshold=5,
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
extraction_strategy=LLMExtractionStrategy(
- provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
+ llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
),
bypass_cache=True,
)