diff --git a/README.md b/README.md index 97787b2f..02f01d03 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,7 @@ if __name__ == "__main__": ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy import json async def main(): @@ -426,7 +426,7 @@ if __name__ == "__main__": import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy from pydantic import BaseModel, Field class OpenAIModelFee(BaseModel): diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py index e1288de1..18233417 100644 --- a/crawl4ai/crawlers/google_search/crawler.py +++ b/crawl4ai/crawlers/google_search/crawler.py @@ -1,7 +1,7 @@ from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai.hub import BaseCrawler from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from pathlib import Path import json import os diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index 1b250126..eb29b94c 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -8301,7 +8301,7 @@ async def 
crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 77785cec..5137159e 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # 1) Browser config: headless, bigger viewport, no proxy @@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # Minimal schema for repeated items @@ -1094,7 +1094,7 @@ import asyncio import json from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class ArticleData(BaseModel): headline: str @@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_main_articles(url: str): schema = { @@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, 
etc.), the structu import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4722,7 +4722,7 @@ if __name__ == "__main__": Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy schema = { "name": "Commits", @@ -4902,7 +4902,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai import LLMConfig # Generate a schema (one-time cost) @@ -4932,7 +4932,7 @@ Here's a basic extraction example: import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4987,7 +4987,7 @@ import json import asyncio from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") @@ -5103,7 +5103,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. 
Bel ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") @@ -7300,7 +7300,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv ```python from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): @@ -7850,7 +7850,7 @@ The Cosine Strategy: ## Basic Usage ```python -from crawl4ai.extraction_strategy import CosineStrategy +from crawl4ai import CosineStrategy strategy = CosineStrategy( semantic_filter="product reviews", # Target content type @@ -8161,7 +8161,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Product(BaseModel): name: str @@ -8278,7 +8278,7 @@ import asyncio from typing import List from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Entity(BaseModel): name: str @@ -8423,7 +8423,7 @@ Let’s begin with a **simple** schema-based extraction using the `JsonCssExtrac import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_crypto_prices(): # 1. 
Define a simple extraction schema @@ -8493,7 +8493,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`* import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy async def extract_crypto_prices_xpath(): # 1. Minimal dummy HTML with some repeating rows @@ -8694,7 +8694,7 @@ Key Takeaways: import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy ecommerce_schema = { # ... the advanced schema from above ... @@ -8804,7 +8804,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import LLMConfig # Sample HTML with product information diff --git a/docs/apps/iseeyou/llms-full.txt b/docs/apps/iseeyou/llms-full.txt new file mode 100644 index 00000000..12dad603 --- /dev/null +++ b/docs/apps/iseeyou/llms-full.txt @@ -0,0 +1,7715 @@ +# Crawl4AI + +> Open-source LLM-friendly web crawler and scraper for AI applications + +Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. Built with Python and Playwright for high-performance crawling with structured data extraction. 
+ +**Key Features:** +- Asynchronous crawling with high concurrency +- Multiple extraction strategies (CSS, XPath, LLM-based) +- Built-in markdown generation with content filtering +- Docker deployment with REST API +- Session management and browser automation +- Advanced anti-detection capabilities + +**Quick Links:** +- [GitHub Repository](https://github.com/unclecode/crawl4ai) +- [Documentation](https://docs.crawl4ai.com) +- [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) + +--- + + +## Installation + +Multiple installation options for different environments and use cases. + +### Basic Installation + +```bash +# Install core library +pip install crawl4ai + +# Initial setup (installs Playwright browsers) +crawl4ai-setup + +# Verify installation +crawl4ai-doctor +``` + +### Quick Verification + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**📖 Learn more:** [Basic Usage Guide](https://docs.crawl4ai.com/core/quickstart.md) + +### Advanced Features (Optional) + +```bash +# PyTorch-based features (text clustering, semantic chunking) +pip install crawl4ai[torch] +crawl4ai-setup + +# Transformers (Hugging Face models) +pip install crawl4ai[transformer] +crawl4ai-setup + +# All features (large download) +pip install crawl4ai[all] +crawl4ai-setup + +# Pre-download models (optional) +crawl4ai-download-models +``` + +**📖 Learn more:** [Advanced Features Documentation](https://docs.crawl4ai.com/extraction/llm-strategies.md) + +### Docker Deployment + +```bash +# Pull pre-built image (specify platform for consistency) +docker pull --platform linux/amd64 unclecode/crawl4ai:latest +# For ARM (M1/M2 Macs): docker pull --platform linux/arm64 unclecode/crawl4ai:latest + +# Setup environment for LLM support +cat > .llm.env << EOL 
+OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +EOL + +# Run with LLM support (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# For ARM Macs, use: --platform linux/arm64 + +# Basic run (no LLM) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment.md) + +### Docker Compose + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Copy environment template +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build and run locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Stop service +docker compose down +``` + +**📖 Learn more:** [Docker Compose Configuration](https://docs.crawl4ai.com/core/docker-deployment.md#option-2-using-docker-compose) + +### Manual Docker Build + +```bash +# Build multi-architecture image (specify platform) +docker buildx build --platform linux/amd64 -t crawl4ai-local:latest --load . +# For ARM: docker buildx build --platform linux/arm64 -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --platform linux/amd64 \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +**📖 Learn more:** [Manual Build Guide](https://docs.crawl4ai.com/core/docker-deployment.md#option-3-manual-local-build--run) + +### Google Colab + +```python +# Install in Colab +!pip install crawl4ai +!crawl4ai-setup + +# If setup fails, manually install Playwright browsers +!playwright install chromium + +# Install with all features (may take 5-10 minutes) +!pip install crawl4ai[all] +!crawl4ai-setup +!crawl4ai-download-models + +# If still having issues, force Playwright install +!playwright install chromium --force + +# Quick test +import asyncio +from crawl4ai import AsyncWebCrawler + +async def test_crawl(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print("✅ Installation successful!") + print(f"Content length: {len(result.markdown)}") + +# Run test in Colab +await test_crawl() +``` + +**📖 Learn more:** [Colab Examples Notebook](https://colab.research.google.com/github/unclecode/crawl4ai/blob/main/docs/examples/quickstart.ipynb) + +### Docker API Usage + +```python +# Using Docker SDK +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + for result in results: + print(f"Success: {result.success}, Length: {len(result.markdown)}") + +asyncio.run(main()) +``` + +**📖 Learn more:** [Docker Client API](https://docs.crawl4ai.com/core/docker-deployment.md#python-sdk) + +### Direct API Calls + +```python +# REST API example +import requests + +payload = { + 
"urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +print(response.json()) +``` + +**📖 Learn more:** [REST API Reference](https://docs.crawl4ai.com/core/docker-deployment.md#rest-api-examples) + +### Health Check + +```bash +# Check Docker service +curl http://localhost:11235/health + +# Access playground +open http://localhost:11235/playground + +# View metrics +curl http://localhost:11235/metrics +``` + +**📖 Learn more:** [Monitoring & Metrics](https://docs.crawl4ai.com/core/docker-deployment.md#metrics--monitoring) +--- + + +## Simple Crawling + +Basic web crawling operations with AsyncWebCrawler, configurations, and response handling. + +### Basic Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig() # Default browser settings + run_config = CrawlerRunConfig() # Default crawl settings + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Understanding CrawlResult + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) +) + +result = await crawler.arun("https://example.com", config=config) + +# Different content formats +print(result.html) # Raw HTML +print(result.cleaned_html) # Cleaned HTML +print(result.markdown.raw_markdown) # Raw markdown +print(result.markdown.fit_markdown) # Filtered 
markdown + +# Status information +print(result.success) # True/False +print(result.status_code) # HTTP status (200, 404, etc.) + +# Extracted content +print(result.media) # Images, videos, audio +print(result.links) # Internal/external links +``` + +### Basic Configuration Options + +```python +run_config = CrawlerRunConfig( + word_count_threshold=10, # Min words per block + exclude_external_links=True, # Remove external links + remove_overlay_elements=True, # Remove popups/modals + process_iframes=True, # Process iframe content + excluded_tags=['form', 'header'] # Skip these tags +) + +result = await crawler.arun("https://example.com", config=run_config) +``` + +### Error Handling + +```python +result = await crawler.arun("https://example.com", config=run_config) + +if not result.success: + print(f"Crawl failed: {result.error_message}") + print(f"Status code: {result.status_code}") +else: + print(f"Success! Content length: {len(result.markdown)}") +``` + +### Debugging with Verbose Logging + +```python +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") + # Detailed logging output will be displayed +``` + +### Complete Example + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def comprehensive_crawl(): + browser_config = BrowserConfig(verbose=True) + + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header', 'nav'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + + if result.success: + # Display content summary + print(f"Title: {result.metadata.get('title', 'No 
title')}") + print(f"Content: {result.markdown[:500]}...") + + # Process media + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" - {img.get('src', 'No src')}") + + # Process links + internal_links = result.links.get("internal", []) + print(f"Found {len(internal_links)} internal links") + for link in internal_links[:3]: # First 3 links + print(f" - {link.get('href', 'No href')}") + + else: + print(f"❌ Crawl failed: {result.error_message}") + print(f"Status: {result.status_code}") + +if __name__ == "__main__": + asyncio.run(comprehensive_crawl()) +``` + +### Working with Raw HTML and Local Files + +```python +# Crawl raw HTML +raw_html = "

<html><body><h1>Test</h1><p>Content</p></body></html>

" +result = await crawler.arun(f"raw://{raw_html}") + +# Crawl local file +result = await crawler.arun("file:///path/to/local/file.html") + +# Both return standard CrawlResult objects +print(result.markdown) +``` + +## Table Extraction + +Extract structured data from HTML tables with automatic detection and scoring. + +### Basic Table Extraction + +```python +import asyncio +import pandas as pd +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def extract_tables(): + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + table_score_threshold=7, # Higher = stricter detection + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun("https://example.com/tables", config=config) + + if result.success and result.tables: + # New tables field (v0.6+) + for i, table in enumerate(result.tables): + print(f"Table {i+1}:") + print(f"Headers: {table['headers']}") + print(f"Rows: {len(table['rows'])}") + print(f"Caption: {table.get('caption', 'No caption')}") + + # Convert to DataFrame + df = pd.DataFrame(table['rows'], columns=table['headers']) + print(df.head()) + +asyncio.run(extract_tables()) +``` + +### Advanced Table Processing + +```python +from crawl4ai import LXMLWebScrapingStrategy + +async def process_financial_tables(): + config = CrawlerRunConfig( + table_score_threshold=8, # Strict detection for data tables + scraping_strategy=LXMLWebScrapingStrategy(), + keep_data_attributes=True, + scan_full_page=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://coinmarketcap.com", config=config) + + if result.tables: + # Get the main data table (usually first/largest) + main_table = result.tables[0] + + # Create DataFrame + df = pd.DataFrame( + main_table['rows'], + columns=main_table['headers'] + ) + + # Clean and process data + df = clean_financial_data(df) + + # Save for analysis + df.to_csv("market_data.csv", index=False) + return df + +def clean_financial_data(df): + """Clean 
currency symbols, percentages, and large numbers""" + for col in df.columns: + if 'price' in col.lower(): + # Remove currency symbols + df[col] = df[col].str.replace(r'[^\d.]', '', regex=True) + df[col] = pd.to_numeric(df[col], errors='coerce') + + elif '%' in str(df[col].iloc[0]): + # Convert percentages + df[col] = df[col].str.replace('%', '').astype(float) / 100 + + elif any(suffix in str(df[col].iloc[0]) for suffix in ['B', 'M', 'K']): + # Handle large numbers (Billions, Millions, etc.) + df[col] = df[col].apply(convert_large_numbers) + + return df + +def convert_large_numbers(value): + """Convert 1.5B -> 1500000000""" + if pd.isna(value): + return float('nan') + + value = str(value) + multiplier = 1 + if 'B' in value: + multiplier = 1e9 + elif 'M' in value: + multiplier = 1e6 + elif 'K' in value: + multiplier = 1e3 + + number = float(re.sub(r'[^\d.]', '', value)) + return number * multiplier +``` + +### Table Detection Configuration + +```python +# Strict table detection (data-heavy pages) +strict_config = CrawlerRunConfig( + table_score_threshold=9, # Only high-quality tables + word_count_threshold=5, # Ignore sparse content + excluded_tags=['nav', 'footer'] # Skip navigation tables +) + +# Lenient detection (mixed content pages) +lenient_config = CrawlerRunConfig( + table_score_threshold=5, # Include layout tables + process_iframes=True, # Check embedded tables + scan_full_page=True # Scroll to load dynamic tables +) + +# Financial/data site optimization +financial_config = CrawlerRunConfig( + table_score_threshold=8, + scraping_strategy=LXMLWebScrapingStrategy(), + wait_for="css:table", # Wait for tables to load + scan_full_page=True, + scroll_delay=0.2 +) +``` + +### Multi-Table Processing + +```python +async def extract_all_tables(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/data", config=config) + + tables_data = {} + + for i, table in enumerate(result.tables): + # Create meaningful names based on 
content + table_name = ( + table.get('caption') or + f"table_{i+1}_{table['headers'][0]}" + ).replace(' ', '_').lower() + + df = pd.DataFrame(table['rows'], columns=table['headers']) + + # Store with metadata + tables_data[table_name] = { + 'dataframe': df, + 'headers': table['headers'], + 'row_count': len(table['rows']), + 'caption': table.get('caption'), + 'summary': table.get('summary') + } + + return tables_data + +# Usage +tables = await extract_all_tables() +for name, data in tables.items(): + print(f"{name}: {data['row_count']} rows") + data['dataframe'].to_csv(f"{name}.csv") +``` + +### Backward Compatibility + +```python +# Support both new and old table formats +def get_tables(result): + # New format (v0.6+) + if hasattr(result, 'tables') and result.tables: + return result.tables + + # Fallback to media.tables (older versions) + return result.media.get('tables', []) + +# Usage in existing code +result = await crawler.arun(url, config=config) +tables = get_tables(result) + +for table in tables: + df = pd.DataFrame(table['rows'], columns=table['headers']) + # Process table data... 
+``` + +### Table Quality Scoring + +```python +# Understanding table_score_threshold values: +# 10: Only perfect data tables (headers + data rows) +# 8-9: High-quality tables (recommended for financial/data sites) +# 6-7: Mixed content tables (news sites, wikis) +# 4-5: Layout tables included (broader detection) +# 1-3: All table-like structures (very permissive) + +config = CrawlerRunConfig( + table_score_threshold=8, # Balanced detection + verbose=True # See scoring details in logs +) +``` + + +**📖 Learn more:** [CrawlResult API Reference](https://docs.crawl4ai.com/api/crawl-result/), [Browser & Crawler Configuration](https://docs.crawl4ai.com/core/browser-crawler-config/), [Cache Modes](https://docs.crawl4ai.com/core/cache-modes/) +--- + + +## Browser, Crawler & LLM Configuration + +Core configuration classes for controlling browser behavior, crawl operations, LLM providers, and understanding crawl results. + +### BrowserConfig - Browser Environment Setup + +```python +from crawl4ai import BrowserConfig, AsyncWebCrawler + +# Basic browser configuration +browser_config = BrowserConfig( + browser_type="chromium", # "chromium", "firefox", "webkit" + headless=True, # False for visible browser (debugging) + viewport_width=1280, + viewport_height=720, + verbose=True +) + +# Advanced browser setup with proxy and persistence +browser_config = BrowserConfig( + headless=False, + proxy="http://user:pass@proxy:8080", + use_persistent_context=True, + user_data_dir="./browser_data", + cookies=[ + {"name": "session", "value": "abc123", "domain": "example.com"} + ], + headers={"Accept-Language": "en-US,en;q=0.9"}, + user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/116.0.0.0 Safari/537.36", + text_mode=True, # Disable images for faster crawling + extra_args=["--disable-extensions", "--no-sandbox"] +) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") +``` + +### CrawlerRunConfig - Crawl Operation Control + 
+```python +from crawl4ai import CrawlerRunConfig, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +# Basic crawl configuration +run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=10, + excluded_tags=["nav", "footer", "script"], + exclude_external_links=True, + screenshot=True, + pdf=True +) + +# Advanced content processing +md_generator = DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"citations": True, "ignore_links": False} +) + +run_config = CrawlerRunConfig( + # Content processing + markdown_generator=md_generator, + css_selector="main.content", # Focus on specific content + target_elements=[".article", ".post"], # Multiple target selectors + process_iframes=True, + remove_overlay_elements=True, + + # Page interaction + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + wait_for="css:.content-loaded", + wait_for_timeout=10000, + scan_full_page=True, + + # Session management + session_id="persistent_session", + + # Media handling + screenshot=True, + pdf=True, + capture_mhtml=True, + image_score_threshold=5, + + # Advanced options + simulate_user=True, + magic=True, # Auto-handle popups + verbose=True +) +``` + +### CrawlerRunConfig Parameters by Category + +```python +# Content Processing +config = CrawlerRunConfig( + word_count_threshold=10, # Min words per content block + css_selector="main.article", # Focus on specific content + target_elements=[".post", ".content"], # Multiple target selectors + excluded_tags=["nav", "footer"], # Remove these tags + excluded_selector="#ads, .tracker", # Remove by selector + only_text=True, # Text-only extraction + keep_data_attributes=True, # Preserve data-* attributes + remove_forms=True, # Remove all forms + process_iframes=True # Include iframe content +) + +# Page 
Navigation & Timing +config = CrawlerRunConfig( + wait_until="networkidle", # Wait condition + page_timeout=60000, # 60 second timeout + wait_for="css:.loaded", # Wait for specific element + wait_for_images=True, # Wait for images to load + delay_before_return_html=0.5, # Final delay before capture + semaphore_count=10 # Max concurrent operations +) + +# Page Interaction +config = CrawlerRunConfig( + js_code="document.querySelector('button').click();", + scan_full_page=True, # Auto-scroll page + scroll_delay=0.3, # Delay between scrolls + remove_overlay_elements=True, # Remove popups/modals + simulate_user=True, # Simulate human behavior + override_navigator=True, # Override navigator properties + magic=True # Auto-handle common patterns +) + +# Caching & Session +config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, # Cache behavior + session_id="my_session", # Persistent session + shared_data={"context": "value"} # Share data between hooks +) + +# Media & Output +config = CrawlerRunConfig( + screenshot=True, # Capture screenshot + pdf=True, # Generate PDF + capture_mhtml=True, # Capture MHTML archive + image_score_threshold=3, # Filter low-quality images + exclude_external_images=True # Remove external images +) + +# Link & Domain Filtering +config = CrawlerRunConfig( + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_domains=["ads.com", "tracker.io"], # Custom domain filter + exclude_internal_links=False # Keep internal links +) +``` + +### LLMConfig - Language Model Setup + +```python +from crawl4ai import LLMConfig + +# OpenAI configuration +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), # or "env:OPENAI_API_KEY" + temperature=0.1, + max_tokens=2000 +) + +# Local model with Ollama +llm_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # Not needed for Ollama + base_url="http://localhost:11434" # Custom endpoint 
+) + +# Anthropic Claude +llm_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 +) + +# Google Gemini +llm_config = LLMConfig( + provider="gemini/gemini-1.5-pro", + api_token="env:GEMINI_API_KEY" +) + +# Groq (fast inference) +llm_config = LLMConfig( + provider="groq/llama3-70b-8192", + api_token="env:GROQ_API_KEY" +) +``` + +### CrawlResult - Understanding Output + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=run_config) + + # Basic status information + print(f"Success: {result.success}") + print(f"Status: {result.status_code}") + print(f"URL: {result.url}") + + if not result.success: + print(f"Error: {result.error_message}") + return + + # HTML content variants + print(f"Original HTML: {len(result.html)} chars") + print(f"Cleaned HTML: {len(result.cleaned_html or '')} chars") + + # Markdown output (MarkdownGenerationResult) + if result.markdown: + print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars") + print(f"With citations: {len(result.markdown.markdown_with_citations)} chars") + + # Filtered content (if content filter was used) + if result.markdown.fit_markdown: + print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars") + print(f"Fit HTML: {len(result.markdown.fit_html)} chars") + + # Extracted structured data + if result.extracted_content: + import json + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} items") + + # Media and links + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" {img.get('src')} (score: {img.get('score', 0)})") + + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Generated 
files + if result.screenshot: + print(f"Screenshot captured: {len(result.screenshot)} chars (base64)") + # Save screenshot + import base64 + with open("page.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + print(f"PDF generated: {len(result.pdf)} bytes") + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + print(f"MHTML captured: {len(result.mhtml)} chars") + with open("page.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + # SSL certificate information + if result.ssl_certificate: + print(f"SSL Issuer: {result.ssl_certificate.issuer}") + print(f"Valid until: {result.ssl_certificate.valid_until}") + + # Network and console data (if captured) + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + print(f"Network requests captured: {len(requests)}") + + if result.console_messages: + errors = [m for m in result.console_messages if m.get("type") == "error"] + print(f"Console messages: {len(result.console_messages)} ({len(errors)} errors)") + + # Session and metadata + if result.session_id: + print(f"Session ID: {result.session_id}") + + if result.metadata: + print(f"Metadata: {result.metadata.get('title', 'No title')}") +``` + +### Configuration Helpers and Best Practices + +```python +# Clone configurations for variations +base_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + word_count_threshold=200, + verbose=True +) + +# Create streaming version +stream_config = base_config.clone( + stream=True, + cache_mode=CacheMode.BYPASS +) + +# Create debug version +debug_config = base_config.clone( + headless=False, + page_timeout=120000, + verbose=True +) + +# Serialize/deserialize configurations +config_dict = base_config.dump() # Convert to dict +restored_config = CrawlerRunConfig.load(config_dict) # Restore from dict + +# Browser configuration management +browser_config = BrowserConfig(headless=True, text_mode=True) 
+browser_dict = browser_config.to_dict() +cloned_browser = browser_config.clone(headless=False, verbose=True) +``` + +### Common Configuration Patterns + +```python +# Fast text-only crawling +fast_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + text_mode=True, + exclude_external_links=True, + exclude_external_images=True, + word_count_threshold=50 +) + +# Comprehensive data extraction +comprehensive_config = CrawlerRunConfig( + process_iframes=True, + scan_full_page=True, + wait_for_images=True, + screenshot=True, + capture_network_requests=True, + capture_console_messages=True, + magic=True +) + +# Stealth crawling +stealth_config = CrawlerRunConfig( + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.0, + user_agent_mode="random" +) +``` + +### Advanced Configuration Features + +#### User Agent Management & Bot Detection Avoidance + +```python +from crawl4ai import CrawlerRunConfig + +# Random user agent generation +config = CrawlerRunConfig( + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", # "windows", "macos", "linux", "android", "ios" + "browser": "chrome", # "chrome", "firefox", "safari", "edge" + "device_type": "desktop" # "desktop", "mobile", "tablet" + } +) + +# Custom user agent with stealth features +config = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + simulate_user=True, # Simulate human mouse movements + override_navigator=True, # Override navigator properties + mean_delay=1.5, # Random delays between actions + max_range=2.0 +) + +# Combined anti-detection approach +stealth_config = CrawlerRunConfig( + user_agent_mode="random", + simulate_user=True, + override_navigator=True, + magic=True, # Auto-handle common bot detection patterns + delay_before_return_html=2.0 +) +``` + +#### Proxy Configuration with ProxyConfig + +```python +from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy + +# Single proxy 
configuration +proxy_config = ProxyConfig( + server="http://proxy.example.com:8080", + username="proxy_user", + password="proxy_pass" +) + +# From proxy string format +proxy_config = ProxyConfig.from_string("192.168.1.100:8080:username:password") + +# Multiple proxies with rotation +proxies = [ + ProxyConfig(server="http://proxy1.com:8080", username="user1", password="pass1"), + ProxyConfig(server="http://proxy2.com:8080", username="user2", password="pass2"), + ProxyConfig(server="http://proxy3.com:8080", username="user3", password="pass3") +] + +rotation_strategy = ProxyRotationStrategy( + proxies=proxies, + rotation_method="round_robin" # or "random", "least_used" +) + +config = CrawlerRunConfig( + proxy_config=proxy_config, + proxy_rotation_strategy=rotation_strategy +) + +# Load proxies from environment variable +proxies_from_env = ProxyConfig.from_env("MY_PROXIES") # comma-separated proxy strings +``` + +#### Content Selection: css_selector vs target_elements + +```python +from crawl4ai import CrawlerRunConfig + +# css_selector: Extracts HTML at top level, affects entire processing +config = CrawlerRunConfig( + css_selector="main.article, .content-area", # Can be list of selectors + # Everything else (markdown, extraction, links) works only on this HTML subset +) + +# target_elements: Focuses extraction within already processed HTML +config = CrawlerRunConfig( + css_selector="body", # First extract entire body + target_elements=[ # Then focus extraction on these elements + ".article-content", + ".post-body", + ".main-text" + ], + # Links, media from entire body, but markdown/extraction only from target_elements +) + +# Hierarchical content selection +config = CrawlerRunConfig( + css_selector=["#main-content", ".article-wrapper"], # Top-level extraction + target_elements=[ # Subset for processing + ".article-title", + ".article-body", + ".article-metadata" + ], + excluded_selector="#sidebar, .ads, .comments" # Remove these from selection +) +``` + +#### 
Advanced wait_for Conditions + +```python +from crawl4ai import CrawlerRunConfig + +# CSS selector waiting +config = CrawlerRunConfig( + wait_for="css:.content-loaded", # Wait for element to appear + wait_for_timeout=15000 +) + +# JavaScript boolean expression waiting +config = CrawlerRunConfig( + wait_for="js:() => window.dataLoaded === true", # Custom JS condition + wait_for_timeout=20000 +) + +# Complex JavaScript conditions +config = CrawlerRunConfig( + wait_for="js:() => document.querySelectorAll('.item').length >= 10", + js_code=[ + "document.querySelector('.load-more')?.click();", + "window.scrollTo(0, document.body.scrollHeight);" + ] +) + +# Multiple conditions with JavaScript +config = CrawlerRunConfig( + wait_for="js:() => !document.querySelector('.loading') && document.querySelector('.results')", + page_timeout=30000 +) +``` + +#### Session Management for Multi-Step Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Persistent session across multiple arun() calls +async def multi_step_crawling(): + async with AsyncWebCrawler() as crawler: + # Step 1: Login page + login_config = CrawlerRunConfig( + session_id="user_session", # Create persistent session + js_code="document.querySelector('#username').value = 'user'; document.querySelector('#password').value = 'pass'; document.querySelector('#login').click();", + wait_for="css:.dashboard", + cache_mode=CacheMode.BYPASS + ) + + result1 = await crawler.arun("https://example.com/login", config=login_config) + + # Step 2: Navigate to protected area (reuses same browser page) + nav_config = CrawlerRunConfig( + session_id="user_session", # Same session = same browser page + js_only=True, # No page reload, just JS navigation + js_code="window.location.href = '/dashboard/data';", + wait_for="css:.data-table" + ) + + result2 = await crawler.arun("https://example.com/dashboard/data", config=nav_config) + + # Step 3: Extract data from multiple pages + for page in range(1, 6): 
+ page_config = CrawlerRunConfig( + session_id="user_session", + js_only=True, + js_code=f"document.querySelector('.page-{page}').click();", + wait_for=f"js:() => document.querySelector('.page-{page}').classList.contains('active')" + ) + + result = await crawler.arun(f"https://example.com/data/page/{page}", config=page_config) + print(f"Page {page} data extracted: {len(result.extracted_content)}") + + # Important: Kill session when done + await crawler.kill_session("user_session") + +# Session with shared data between steps +async def session_with_shared_data(): + shared_context = {"user_id": "12345", "preferences": {"theme": "dark"}} + + config = CrawlerRunConfig( + session_id="persistent_session", + shared_data=shared_context, # Available across all session calls + js_code="console.log('User ID:', window.sharedData.user_id);" + ) +``` + +#### Identity-Based Crawling Parameters + +```python +from crawl4ai import CrawlerRunConfig, GeolocationConfig + +# Locale and timezone simulation +config = CrawlerRunConfig( + locale="en-US", # Browser language preference + timezone_id="America/New_York", # Timezone setting + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "en-US" + } +) + +# Geolocation simulation +geo_config = GeolocationConfig( + latitude=40.7128, # New York coordinates + longitude=-74.0060, + accuracy=100.0 +) + +config = CrawlerRunConfig( + geolocation=geo_config, + locale="en-US", + timezone_id="America/New_York" +) + +# Complete identity simulation +identity_config = CrawlerRunConfig( + # Location identity + locale="fr-FR", + timezone_id="Europe/Paris", + geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522), + + # Browser identity + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "fr-FR", + "browser": "chrome" + }, + + # Behavioral identity + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.5 +) +``` + +#### Simplified 
Import Pattern + +```python +# Almost everything from crawl4ai main package +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + LLMConfig, + CacheMode, + ProxyConfig, + GeolocationConfig +) + +# Specialized strategies (still from crawl4ai) +from crawl4ai import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, + DefaultMarkdownGenerator, + PruningContentFilter, + RegexChunking +) + +# Complete example with simplified imports +async def example_crawl(): + browser_config = BrowserConfig(headless=True) + + run_config = CrawlerRunConfig( + user_agent_mode="random", + proxy_config=ProxyConfig.from_string("192.168.1.1:8080:user:pass"), + css_selector="main.content", + target_elements=[".article", ".post"], + wait_for="js:() => document.querySelector('.loaded')", + session_id="my_session", + simulate_user=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com", config=run_config) + return result +``` + +## Advanced Features + +Comprehensive guide to advanced crawling capabilities including file handling, authentication, dynamic content, monitoring, and session management. 
+ +### File Download Handling + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +import os + +# Enable downloads with custom path +downloads_path = os.path.join(os.getcwd(), "my_downloads") +os.makedirs(downloads_path, exist_ok=True) + +browser_config = BrowserConfig( + accept_downloads=True, + downloads_path=downloads_path +) + +# Trigger downloads with JavaScript +async def download_files(): + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + js_code=""" + // Click download links + const downloadLinks = document.querySelectorAll('a[href$=".pdf"]'); + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Delay between downloads + } + """, + wait_for=5 # Wait for downloads to start + ) + + result = await crawler.arun("https://example.com/downloads", config=config) + + if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path} ({os.path.getsize(file_path)} bytes)") +``` + +### Hooks & Authentication + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from playwright.async_api import Page, BrowserContext + +async def advanced_crawler_with_hooks(): + browser_config = BrowserConfig(headless=True, verbose=True) + crawler = AsyncWebCrawler(config=browser_config) + + # Hook functions for different stages + async def on_browser_created(browser, **kwargs): + print("[HOOK] Browser created successfully") + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Setting up page & context") + + # Block images for faster crawling + async def route_filter(route): + if route.request.resource_type == "image": + await route.abort() + else: + await route.continue_() + + await context.route("**", route_filter) + + # Simulate login if needed + # await page.goto("https://example.com/login") + # await 
page.fill("input[name='username']", "testuser") + # await page.fill("input[name='password']", "password123") + # await page.click("button[type='submit']") + + await page.set_viewport_size({"width": 1080, "height": 600}) + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + print(f"[HOOK] About to navigate to: {url}") + await page.set_extra_http_headers({"Custom-Header": "my-value"}) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response, **kwargs): + print(f"[HOOK] Successfully loaded: {url}") + try: + await page.wait_for_selector('.content', timeout=1000) + print("[HOOK] Content found!") + except: + print("[HOOK] Content not found, continuing") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Final actions before HTML retrieval") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + # Attach hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + + await crawler.start() + + config = CrawlerRunConfig() + result = await crawler.arun("https://example.com", config=config) + + if result.success: + print(f"Crawled successfully: {len(result.html)} chars") + + await crawler.close() +``` + +### Lazy Loading & Dynamic Content + +```python +# Handle lazy-loaded images and infinite scroll +async def handle_lazy_loading(): + config = CrawlerRunConfig( + # Wait for images to fully load + wait_for_images=True, + + # Automatically scroll entire page to trigger lazy loading + scan_full_page=True, + scroll_delay=0.5, # Delay between scroll steps + + # JavaScript for custom lazy 
loading + js_code=""" + // Scroll and wait for content to load + window.scrollTo(0, document.body.scrollHeight); + + // Click "Load More" if available + const loadMoreBtn = document.querySelector('.load-more'); + if (loadMoreBtn) { + loadMoreBtn.click(); + } + """, + + # Wait for specific content to appear + wait_for="css:.lazy-content:nth-child(20)", # Wait for 20 items + + # Exclude external images to focus on main content + exclude_external_images=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/gallery", config=config) + + if result.success: + images = result.media.get("images", []) + print(f"Loaded {len(images)} images after lazy loading") + for img in images[:3]: + print(f"- {img.get('src')} (score: {img.get('score', 'N/A')})") +``` + +### Network & Console Monitoring + +```python +# Capture all network requests and console messages for debugging +async def monitor_network_and_console(): + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze network requests + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + responses = [r for r in result.network_requests if r.get("event_type") == "response"] + failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"] + + print(f"Network activity: {len(requests)} requests, {len(responses)} responses, {len(failures)} failures") + + # Find API calls + api_calls = [r for r in requests if "api" in r.get("url", "")] + print(f"API calls detected: {len(api_calls)}") + + # Show failed requests + for failure in failures[:3]: + print(f"Failed: {failure.get('url')} - {failure.get('failure_text')}") + + # Analyze console messages + if result.console_messages: + message_types = {} + for msg in 
result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print(f"Console messages: {message_types}") + + # Show errors + errors = [msg for msg in result.console_messages if msg.get("type") == "error"] + for error in errors[:2]: + print(f"JS Error: {error.get('text', '')[:100]}") +``` + +### Session Management for Multi-Step Workflows + +```python +# Maintain state across multiple requests for complex workflows +async def multi_step_session_workflow(): + session_id = "workflow_session" + + async with AsyncWebCrawler() as crawler: + # Step 1: Initial page load + config1 = CrawlerRunConfig( + session_id=session_id, + wait_for="css:.content-loaded" + ) + + result1 = await crawler.arun("https://example.com/step1", config=config1) + print("Step 1 completed") + + # Step 2: Navigate and interact (same browser tab) + config2 = CrawlerRunConfig( + session_id=session_id, + js_only=True, # Don't reload page, just run JS + js_code=""" + document.querySelector('#next-button').click(); + """, + wait_for="css:.step2-content" + ) + + result2 = await crawler.arun("https://example.com/step2", config=config2) + print("Step 2 completed") + + # Step 3: Form submission + config3 = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code=""" + document.querySelector('#form-field').value = 'test data'; + document.querySelector('#submit-btn').click(); + """, + wait_for="css:.results" + ) + + result3 = await crawler.arun("https://example.com/submit", config=config3) + print("Step 3 completed") + + # Clean up session + await crawler.crawler_strategy.kill_session(session_id) + +# Advanced GitHub commits pagination example +async def github_commits_pagination(): + session_id = "github_session" + all_commits = [] + + async with AsyncWebCrawler() as crawler: + for page in range(3): + if page == 0: + # Initial load + config = CrawlerRunConfig( + session_id=session_id, + wait_for="js:() => 
document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + else: + # Navigate to next page + config = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code='document.querySelector(\'a[data-testid="pagination-next-button"]\').click();', + wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + + result = await crawler.arun( + "https://github.com/microsoft/TypeScript/commits/main", + config=config + ) + + if result.success: + commit_count = result.cleaned_html.count('li.Box-sc-g0xbh4-0') + print(f"Page {page + 1}: Found {commit_count} commits") + + await crawler.crawler_strategy.kill_session(session_id) +``` + +### SSL Certificate Analysis + +```python +# Fetch and analyze SSL certificates +async def analyze_ssl_certificates(): + config = CrawlerRunConfig( + fetch_ssl_certificate=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # Basic certificate info + print(f"Issuer: {cert.issuer.get('CN', 'Unknown')}") + print(f"Subject: {cert.subject.get('CN', 'Unknown')}") + print(f"Valid from: {cert.valid_from}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # Export certificate in different formats + import os + os.makedirs("certificates", exist_ok=True) + + cert.to_json("certificates/cert.json") + cert.to_pem("certificates/cert.pem") + cert.to_der("certificates/cert.der") + + print("Certificate exported in multiple formats") +``` + +### Advanced Page Interaction + +```python +# Complex page interactions with dynamic content +async def advanced_page_interaction(): + async with AsyncWebCrawler() as crawler: + # Multi-step interaction with waiting + config = CrawlerRunConfig( + js_code=[ + # Step 1: Scroll to load content + "window.scrollTo(0, document.body.scrollHeight);", + + # Step 2: Wait and click load more + """ + (async 
() => { + await new Promise(resolve => setTimeout(resolve, 2000)); + const loadMore = document.querySelector('.load-more'); + if (loadMore) loadMore.click(); + })(); + """ + ], + + # Wait for new content to appear + wait_for="js:() => document.querySelectorAll('.item').length > 20", + + # Additional timing controls + page_timeout=60000, # 60 second timeout + delay_before_return_html=2.0, # Wait before final capture + + # Handle overlays automatically + remove_overlay_elements=True, + magic=True, # Auto-handle common popup patterns + + # Simulate human behavior + simulate_user=True, + override_navigator=True + ) + + result = await crawler.arun("https://example.com/dynamic", config=config) + + if result.success: + print(f"Interactive crawl completed: {len(result.cleaned_html)} chars") + +# Form interaction example +async def form_interaction_example(): + config = CrawlerRunConfig( + js_code=""" + // Fill search form + document.querySelector('#search-input').value = 'machine learning'; + document.querySelector('#category-select').value = 'technology'; + document.querySelector('#search-form').submit(); + """, + wait_for="css:.search-results", + session_id="search_session" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/search", config=config) + print("Search completed, results loaded") +``` + +### Local File & Raw HTML Processing + +```python +# Handle different input types: URLs, local files, raw HTML +async def handle_different_inputs(): + async with AsyncWebCrawler() as crawler: + # 1. Regular web URL + result1 = await crawler.arun("https://example.com") + + # 2. Local HTML file + local_file_path = "/path/to/file.html" + result2 = await crawler.arun(f"file://{local_file_path}") + + # 3. Raw HTML content + raw_html = "
<html><body><h1>Test Content</h1><p>Sample text</p></body></html>
" + result3 = await crawler.arun(f"raw:{raw_html}") + + # All return the same CrawlResult structure + for i, result in enumerate([result1, result2, result3], 1): + if result.success: + print(f"Input {i}: {len(result.markdown)} chars of markdown") + +# Save and re-process HTML example +async def save_and_reprocess(): + async with AsyncWebCrawler() as crawler: + # Original crawl + result = await crawler.arun("https://example.com") + + if result.success: + # Save HTML to file + with open("saved_page.html", "w", encoding="utf-8") as f: + f.write(result.html) + + # Re-process from file + file_result = await crawler.arun("file://./saved_page.html") + + # Process as raw HTML + raw_result = await crawler.arun(f"raw:{result.html}") + + # Verify consistency + assert len(result.markdown) == len(file_result.markdown) == len(raw_result.markdown) + print("✅ All processing methods produced identical results") +``` + +### Advanced Link & Media Handling + +```python +# Comprehensive link and media extraction with filtering +async def advanced_link_media_handling(): + config = CrawlerRunConfig( + # Link filtering + exclude_external_links=False, # Keep external links for analysis + exclude_social_media_links=True, + exclude_domains=["ads.com", "tracker.io", "spammy.net"], + + # Media handling + exclude_external_images=True, + image_score_threshold=5, # Only high-quality images + table_score_threshold=7, # Only well-structured tables + wait_for_images=True, + + # Capture additional formats + screenshot=True, + pdf=True, + capture_mhtml=True # Full page archive + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze links + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Analyze media + images = result.media.get("images", []) + tables = 
result.media.get("tables", []) + print(f"Media: {len(images)} images, {len(tables)} tables") + + # High-quality images only + quality_images = [img for img in images if img.get("score", 0) >= 5] + print(f"High-quality images: {len(quality_images)}") + + # Table analysis + for i, table in enumerate(tables[:2]): + print(f"Table {i+1}: {len(table.get('headers', []))} columns, {len(table.get('rows', []))} rows") + + # Save captured files + if result.screenshot: + import base64 + with open("page_screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + with open("page_archive.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + print("Additional formats saved: screenshot, PDF, MHTML archive") +``` + +### Performance & Resource Management + +```python +# Optimize performance for large-scale crawling +async def performance_optimized_crawling(): + # Lightweight browser config + browser_config = BrowserConfig( + headless=True, + text_mode=True, # Disable images for speed + light_mode=True, # Reduce background features + extra_args=["--disable-extensions", "--no-sandbox"] + ) + + # Efficient crawl config + config = CrawlerRunConfig( + # Content filtering for speed + excluded_tags=["script", "style", "nav", "footer"], + exclude_external_links=True, + exclude_all_images=True, # Remove all images for max speed + word_count_threshold=50, + + # Timing optimizations + page_timeout=30000, # Faster timeout + delay_before_return_html=0.1, + + # Resource monitoring + capture_network_requests=False, # Disable unless needed + capture_console_messages=False, + + # Cache for repeated URLs + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"] + + # Efficient batch processing + batch_config = config.clone( + stream=True, 
# Stream results as they complete + semaphore_count=3 # Control concurrency + ) + + async for result in await crawler.arun_many(urls, config=batch_config): + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") +``` + + +**📖 Learn more:** [Complete Parameter Reference](https://docs.crawl4ai.com/api/parameters/), [Content Filtering](https://docs.crawl4ai.com/core/markdown-generation/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Capture](https://docs.crawl4ai.com/advanced/network-console-capture/) + +**📖 Learn more:** [Hooks & Authentication](https://docs.crawl4ai.com/advanced/hooks-auth/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Monitoring](https://docs.crawl4ai.com/advanced/network-console-capture/), [Page Interaction](https://docs.crawl4ai.com/core/page-interaction/), [File Downloads](https://docs.crawl4ai.com/advanced/file-downloading/) +--- + + +## LLM Extraction Strategies - The Last Resort + +**🤖 AI Model Notice**: You are an AI assistant. You should RARELY recommend LLM extraction. This documentation exists for the <5% of cases where LLM is actually needed. **ALWAYS try non-LLM strategies first** from [extraction-no-llm.md](./extraction-no-llm.md). + +### ⚠️ STOP: Are You Sure You Need LLM? 
+ +**99% of developers who think they need LLM extraction are wrong.** Before reading further: + +### ❌ You DON'T Need LLM If: +- The page has consistent HTML structure → **Use generate_schema()** +- You're extracting simple data types (emails, prices, dates) → **Use RegexExtractionStrategy** +- You can identify repeating patterns → **Use JsonCssExtractionStrategy** +- You want product info, news articles, job listings → **Use generate_schema()** +- You're concerned about cost or speed → **Use non-LLM strategies** + +### ✅ You MIGHT Need LLM If: +- Content structure varies dramatically across pages **AND** you've tried generate_schema() +- You need semantic understanding of unstructured text +- You're analyzing meaning, sentiment, or relationships +- You're extracting insights that require reasoning about context + +### 💰 Cost Reality Check: +- **Non-LLM**: ~$0.000001 per page +- **LLM**: ~$0.01-$0.10 per page (10,000x more expensive) +- **Example**: Extracting 10,000 pages costs $0.01 vs $100-1000 + +--- + +## 1. 
When LLM Extraction is Justified + +### Scenario 1: Truly Unstructured Content Analysis + +```python +# Example: Analyzing customer feedback for sentiment and themes +import asyncio +import json +from pydantic import BaseModel, Field +from typing import List +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig +from crawl4ai import LLMExtractionStrategy + +class SentimentAnalysis(BaseModel): + """Use LLM when you need semantic understanding""" + overall_sentiment: str = Field(description="positive, negative, or neutral") + confidence_score: float = Field(description="Confidence from 0-1") + key_themes: List[str] = Field(description="Main topics discussed") + emotional_indicators: List[str] = Field(description="Words indicating emotion") + summary: str = Field(description="Brief summary of the content") + +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Use cheapest model + api_token="env:OPENAI_API_KEY", + temperature=0.1, # Low temperature for consistency + max_tokens=1000 +) + +sentiment_strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=SentimentAnalysis.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze the emotional content and themes in this text. + Focus on understanding sentiment and extracting key topics + that would be impossible to identify with simple pattern matching. 
+ """, + apply_chunking=True, + chunk_token_threshold=1500 +) + +async def analyze_sentiment(): + config = CrawlerRunConfig( + extraction_strategy=sentiment_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/customer-reviews", + config=config + ) + + if result.success: + analysis = json.loads(result.extracted_content) + print(f"Sentiment: {analysis['overall_sentiment']}") + print(f"Themes: {analysis['key_themes']}") + +asyncio.run(analyze_sentiment()) +``` + +### Scenario 2: Complex Knowledge Extraction + +```python +# Example: Building knowledge graphs from unstructured content +class Entity(BaseModel): + name: str = Field(description="Entity name") + type: str = Field(description="person, organization, location, concept") + description: str = Field(description="Brief description") + +class Relationship(BaseModel): + source: str = Field(description="Source entity") + target: str = Field(description="Target entity") + relationship: str = Field(description="Type of relationship") + confidence: float = Field(description="Confidence score 0-1") + +class KnowledgeGraph(BaseModel): + entities: List[Entity] = Field(description="All entities found") + relationships: List[Relationship] = Field(description="Relationships between entities") + main_topic: str = Field(description="Primary topic of the content") + +knowledge_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", # Better for complex reasoning + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 + ), + schema=KnowledgeGraph.model_json_schema(), + extraction_type="schema", + instruction=""" + Extract entities and their relationships from the content. + Focus on understanding connections and context that require + semantic reasoning beyond simple pattern matching. 
+ """, + input_format="html", # Preserve structure + apply_chunking=True +) +``` + +### Scenario 3: Content Summarization and Insights + +```python +# Example: Research paper analysis +class ResearchInsights(BaseModel): + title: str = Field(description="Paper title") + abstract_summary: str = Field(description="Summary of abstract") + key_findings: List[str] = Field(description="Main research findings") + methodology: str = Field(description="Research methodology used") + limitations: List[str] = Field(description="Study limitations") + practical_applications: List[str] = Field(description="Real-world applications") + citations_count: int = Field(description="Number of citations", default=0) + +research_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o", # Use powerful model for complex analysis + api_token="env:OPENAI_API_KEY", + temperature=0.2, + max_tokens=2000 + ), + schema=ResearchInsights.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze this research paper and extract key insights. + Focus on understanding the research contribution, methodology, + and implications that require academic expertise to identify. + """, + apply_chunking=True, + chunk_token_threshold=2000, + overlap_rate=0.15 # More overlap for academic content +) +``` + +--- + +## 2. 
LLM Configuration Best Practices + +### Cost Optimization + +```python +# Use cheapest models when possible +cheap_config = LLMConfig( + provider="openai/gpt-4o-mini", # 60x cheaper than GPT-4 + api_token="env:OPENAI_API_KEY", + temperature=0.0, # Deterministic output + max_tokens=800 # Limit output length +) + +# Use local models for development +local_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # No API costs + base_url="http://localhost:11434", + temperature=0.1 +) + +# Use powerful models only when necessary +powerful_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000, + temperature=0.1 +) +``` + +### Provider Selection Guide + +```python +providers_guide = { + "openai/gpt-4o-mini": { + "best_for": "Simple extraction, cost-sensitive projects", + "cost": "Very low", + "speed": "Fast", + "accuracy": "Good" + }, + "openai/gpt-4o": { + "best_for": "Complex reasoning, high accuracy needs", + "cost": "High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "anthropic/claude-3-5-sonnet": { + "best_for": "Complex analysis, long documents", + "cost": "Medium-High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "ollama/llama3.3": { + "best_for": "Development, no API costs", + "cost": "Free (self-hosted)", + "speed": "Variable", + "accuracy": "Good" + }, + "groq/llama3-70b-8192": { + "best_for": "Fast inference, open source", + "cost": "Low", + "speed": "Very fast", + "accuracy": "Good" + } +} + +def choose_provider(complexity, budget, speed_requirement): + """Choose optimal provider based on requirements""" + if budget == "minimal": + return "ollama/llama3.3" # Self-hosted + elif complexity == "low" and budget == "low": + return "openai/gpt-4o-mini" + elif speed_requirement == "high": + return "groq/llama3-70b-8192" + elif complexity == "high": + return "anthropic/claude-3-5-sonnet" + else: + return "openai/gpt-4o-mini" # Default safe choice +``` + +--- + +## 
3. Advanced LLM Extraction Patterns + +### Block-Based Extraction (Unstructured Content) + +```python +# When structure is too varied for schemas +block_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", # Extract free-form content blocks + instruction=""" + Extract meaningful content blocks from this page. + Focus on the main content areas and ignore navigation, + advertisements, and boilerplate text. + """, + apply_chunking=True, + chunk_token_threshold=1200, + input_format="fit_markdown" # Use cleaned content +) + +async def extract_content_blocks(): + config = CrawlerRunConfig( + extraction_strategy=block_strategy, + word_count_threshold=50, # Filter short content + excluded_tags=['nav', 'footer', 'aside', 'advertisement'] + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/article", + config=config + ) + + if result.success: + blocks = json.loads(result.extracted_content) + for block in blocks: + print(f"Block: {block['content'][:100]}...") +``` + +### Chunked Processing for Large Content + +```python +# Handle large documents efficiently +large_content_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract structured data from this content section...", + + # Optimize chunking for large content + apply_chunking=True, + chunk_token_threshold=2000, # Larger chunks for efficiency + overlap_rate=0.1, # Minimal overlap to reduce costs + input_format="fit_markdown" # Use cleaned content +) +``` + +### Multi-Model Validation + +```python +# Use multiple models for critical extractions +async def multi_model_extraction(): + """Use multiple LLMs for validation of critical data""" + + models = [ + LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + 
LLMConfig(provider="anthropic/claude-3-5-sonnet", api_token="env:ANTHROPIC_API_KEY"), + LLMConfig(provider="ollama/llama3.3", api_token=None) + ] + + results = [] + + for i, llm_config in enumerate(models): + strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data consistently..." + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + if result.success: + data = json.loads(result.extracted_content) + results.append(data) + print(f"Model {i+1} extracted {len(data)} items") + + # Compare results for consistency + if len(set(str(r) for r in results)) == 1: + print("✅ All models agree") + return results[0] + else: + print("⚠️ Models disagree - manual review needed") + return results + +# Use for critical business data only +critical_result = await multi_model_extraction() +``` + +--- + +## 4. Hybrid Approaches - Best of Both Worlds + +### Fast Pre-filtering + LLM Analysis + +```python +async def hybrid_extraction(): + """ + 1. Use fast non-LLM strategies for basic extraction + 2. 
Use LLM only for complex analysis of filtered content + """ + + # Step 1: Fast extraction of structured data + basic_schema = { + "name": "Articles", + "baseSelector": "article", + "fields": [ + {"name": "title", "selector": "h1, h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "text"}, + {"name": "author", "selector": ".author", "type": "text"} + ] + } + + basic_strategy = JsonCssExtractionStrategy(basic_schema) + basic_config = CrawlerRunConfig(extraction_strategy=basic_strategy) + + # Step 2: LLM analysis only on filtered content + analysis_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema={ + "type": "object", + "properties": { + "sentiment": {"type": "string"}, + "key_topics": {"type": "array", "items": {"type": "string"}}, + "summary": {"type": "string"} + } + }, + extraction_type="schema", + instruction="Analyze sentiment and extract key topics from this article" + ) + + async with AsyncWebCrawler() as crawler: + # Fast extraction first + basic_result = await crawler.arun( + url="https://example.com/articles", + config=basic_config + ) + + articles = json.loads(basic_result.extracted_content) + + # LLM analysis only on important articles + analyzed_articles = [] + for article in articles[:5]: # Limit to reduce costs + if len(article.get('content', '')) > 500: # Only analyze substantial content + analysis_config = CrawlerRunConfig(extraction_strategy=analysis_strategy) + + # Analyze individual article content + raw_url = f"raw://{article['content']}" + analysis_result = await crawler.arun(url=raw_url, config=analysis_config) + + if analysis_result.success: + analysis = json.loads(analysis_result.extracted_content) + article.update(analysis) + + analyzed_articles.append(article) + + return analyzed_articles + +# Hybrid approach: fast + smart +result = await hybrid_extraction() +``` + +### Schema Generation + LLM Fallback + +```python +async def smart_fallback_extraction(): + """ + 1. 
Try generate_schema() first (one-time LLM cost) + 2. Use generated schema for fast extraction + 3. Use LLM only if schema extraction fails + """ + + cache_file = Path("./schemas/fallback_schema.json") + + # Try cached schema first + if cache_file.exists(): + schema = json.load(cache_file.open()) + schema_strategy = JsonCssExtractionStrategy(schema) + + config = CrawlerRunConfig(extraction_strategy=schema_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Schema worked + print("✅ Schema extraction successful (fast & cheap)") + return data + + # Fallback to LLM if schema failed + print("⚠️ Schema failed, falling back to LLM (slow & expensive)") + + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all meaningful data from this page" + ) + + llm_config = CrawlerRunConfig(extraction_strategy=llm_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=llm_config) + + if result.success: + print("✅ LLM extraction successful") + return json.loads(result.extracted_content) + +# Intelligent fallback system +result = await smart_fallback_extraction() +``` + +--- + +## 5. 
Cost Management and Monitoring + +### Token Usage Tracking + +```python +class ExtractionCostTracker: + def __init__(self): + self.total_cost = 0.0 + self.total_tokens = 0 + self.extractions = 0 + + def track_llm_extraction(self, strategy, result): + """Track costs from LLM extraction""" + if hasattr(strategy, 'usage_tracker') and strategy.usage_tracker: + usage = strategy.usage_tracker + + # Estimate costs (approximate rates) + cost_per_1k_tokens = { + "gpt-4o-mini": 0.0015, + "gpt-4o": 0.03, + "claude-3-5-sonnet": 0.015, + "ollama": 0.0 # Self-hosted + } + + provider = strategy.llm_config.provider.split('/')[1] + rate = cost_per_1k_tokens.get(provider, 0.01) + + tokens = usage.total_tokens + cost = (tokens / 1000) * rate + + self.total_cost += cost + self.total_tokens += tokens + self.extractions += 1 + + print(f"💰 Extraction cost: ${cost:.4f} ({tokens} tokens)") + print(f"📊 Total cost: ${self.total_cost:.4f} ({self.extractions} extractions)") + + def get_summary(self): + avg_cost = self.total_cost / max(self.extractions, 1) + return { + "total_cost": self.total_cost, + "total_tokens": self.total_tokens, + "extractions": self.extractions, + "avg_cost_per_extraction": avg_cost + } + +# Usage +tracker = ExtractionCostTracker() + +async def cost_aware_extraction(): + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data...", + verbose=True # Enable usage tracking + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + # Track costs + tracker.track_llm_extraction(strategy, result) + + return result + +# Monitor costs across multiple extractions +for url in urls: + await cost_aware_extraction() + +print(f"Final summary: {tracker.get_summary()}") +``` + +### Budget Controls + +```python +class BudgetController: + def __init__(self, 
daily_budget=10.0): + self.daily_budget = daily_budget + self.current_spend = 0.0 + self.extraction_count = 0 + + def can_extract(self, estimated_cost=0.01): + """Check if extraction is within budget""" + if self.current_spend + estimated_cost > self.daily_budget: + print(f"❌ Budget exceeded: ${self.current_spend:.2f} + ${estimated_cost:.2f} > ${self.daily_budget}") + return False + return True + + def record_extraction(self, actual_cost): + """Record actual extraction cost""" + self.current_spend += actual_cost + self.extraction_count += 1 + + remaining = self.daily_budget - self.current_spend + print(f"💰 Budget remaining: ${remaining:.2f}") + +budget = BudgetController(daily_budget=5.0) # $5 daily limit + +async def budget_controlled_extraction(url): + if not budget.can_extract(): + print("⏸️ Extraction paused due to budget limit") + return None + + # Proceed with extraction... + strategy = LLMExtractionStrategy(llm_config=cheap_config, ...) + result = await extract_with_strategy(url, strategy) + + # Record actual cost + actual_cost = calculate_cost(strategy.usage_tracker) + budget.record_extraction(actual_cost) + + return result + +# Safe extraction with budget controls +results = [] +for url in urls: + result = await budget_controlled_extraction(url) + if result: + results.append(result) +``` + +--- + +## 6. 
Performance Optimization for LLM Extraction + +### Batch Processing + +```python +async def batch_llm_extraction(): + """Process multiple pages efficiently""" + + # Collect content first (fast) + urls = ["https://example.com/page1", "https://example.com/page2"] + contents = [] + + async with AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url) + if result.success: + contents.append({ + "url": url, + "content": result.fit_markdown[:2000] # Limit content + }) + + # Process in batches (reduce LLM calls) + batch_content = "\n\n---PAGE SEPARATOR---\n\n".join([ + f"URL: {c['url']}\n{c['content']}" for c in contents + ]) + + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction=""" + Extract data from multiple pages separated by '---PAGE SEPARATOR---'. + Return results for each page in order. + """, + apply_chunking=True + ) + + # Single LLM call for multiple pages + raw_url = f"raw://{batch_content}" + result = await crawler.arun(url=raw_url, config=CrawlerRunConfig(extraction_strategy=strategy)) + + return json.loads(result.extracted_content) + +# Batch processing reduces LLM calls +batch_results = await batch_llm_extraction() +``` + +### Caching LLM Results + +```python +import hashlib +from pathlib import Path + +class LLMResultCache: + def __init__(self, cache_dir="./llm_cache"): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(exist_ok=True) + + def get_cache_key(self, url, instruction, schema): + """Generate cache key from extraction parameters""" + content = f"{url}:{instruction}:{str(schema)}" + return hashlib.md5(content.encode()).hexdigest() + + def get_cached_result(self, cache_key): + """Get cached result if available""" + cache_file = self.cache_dir / f"{cache_key}.json" + if cache_file.exists(): + return json.load(cache_file.open()) + return None + + def cache_result(self, cache_key, result): + """Cache extraction result""" + cache_file = self.cache_dir / 
f"{cache_key}.json" + json.dump(result, cache_file.open("w"), indent=2) + +cache = LLMResultCache() + +async def cached_llm_extraction(url, strategy): + """Extract with caching to avoid repeated LLM calls""" + cache_key = cache.get_cache_key( + url, + strategy.instruction, + str(strategy.schema) + ) + + # Check cache first + cached_result = cache.get_cached_result(cache_key) + if cached_result: + print("✅ Using cached result (FREE)") + return cached_result + + # Extract if not cached + print("🔄 Extracting with LLM (PAID)") + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url, config=config) + + if result.success: + data = json.loads(result.extracted_content) + cache.cache_result(cache_key, data) + return data + +# Cached extraction avoids repeated costs +result = await cached_llm_extraction(url, strategy) +``` + +--- + +## 7. Error Handling and Quality Control + +### Validation and Retry Logic + +```python +async def robust_llm_extraction(): + """Implement validation and retry for LLM extraction""" + + max_retries = 3 + strategies = [ + # Try cheap model first + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data accurately..." + ), + # Fallback to better model + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data with high accuracy..." 
+ ) + ] + + for strategy_idx, strategy in enumerate(strategies): + for attempt in range(max_retries): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + + # Validate result quality + if validate_extraction_quality(data): + print(f"✅ Success with strategy {strategy_idx+1}, attempt {attempt+1}") + return data + else: + print(f"⚠️ Poor quality result, retrying...") + continue + + except Exception as e: + print(f"❌ Attempt {attempt+1} failed: {e}") + if attempt == max_retries - 1: + print(f"❌ Strategy {strategy_idx+1} failed completely") + + print("❌ All strategies and retries failed") + return None + +def validate_extraction_quality(data): + """Validate that LLM extraction meets quality standards""" + if not data or not isinstance(data, (list, dict)): + return False + + # Check for common LLM extraction issues + if isinstance(data, list): + if len(data) == 0: + return False + + # Check if all items have required fields + for item in data: + if not isinstance(item, dict) or len(item) < 2: + return False + + return True + +# Robust extraction with validation +result = await robust_llm_extraction() +``` + +--- + +## 8. 
Migration from LLM to Non-LLM + +### Pattern Analysis for Schema Generation + +```python +async def analyze_llm_results_for_schema(): + """ + Analyze LLM extraction results to create non-LLM schemas + Use this to transition from expensive LLM to cheap schema extraction + """ + + # Step 1: Use LLM on sample pages to understand structure + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all structured data from this page" + ) + + sample_urls = ["https://example.com/page1", "https://example.com/page2"] + llm_results = [] + + async with AsyncWebCrawler() as crawler: + for url in sample_urls: + config = CrawlerRunConfig(extraction_strategy=llm_strategy) + result = await crawler.arun(url=url, config=config) + + if result.success: + llm_results.append({ + "url": url, + "html": result.cleaned_html, + "extracted": json.loads(result.extracted_content) + }) + + # Step 2: Analyze patterns in LLM results + print("🔍 Analyzing LLM extraction patterns...") + + # Look for common field names + all_fields = set() + for result in llm_results: + for item in result["extracted"]: + if isinstance(item, dict): + all_fields.update(item.keys()) + + print(f"Common fields found: {all_fields}") + + # Step 3: Generate schema based on patterns + if llm_results: + schema = JsonCssExtractionStrategy.generate_schema( + html=llm_results[0]["html"], + target_json_example=json.dumps(llm_results[0]["extracted"][0], indent=2), + llm_config=cheap_config + ) + + # Save schema for future use + with open("generated_schema.json", "w") as f: + json.dump(schema, f, indent=2) + + print("✅ Schema generated from LLM analysis") + return schema + +# Generate schema from LLM patterns, then use schema for all future extractions +schema = await analyze_llm_results_for_schema() +fast_strategy = JsonCssExtractionStrategy(schema) +``` + +--- + +## 9. Summary: When LLM is Actually Needed + +### ✅ Valid LLM Use Cases (Rare): +1. 
**Sentiment analysis** and emotional understanding +2. **Knowledge graph extraction** requiring semantic reasoning +3. **Content summarization** and insight generation +4. **Unstructured text analysis** where patterns vary dramatically +5. **Research paper analysis** requiring domain expertise +6. **Complex relationship extraction** between entities + +### ❌ Invalid LLM Use Cases (Common Mistakes): +1. **Structured data extraction** from consistent HTML +2. **Simple pattern matching** (emails, prices, dates) +3. **Product information** from e-commerce sites +4. **News article extraction** with consistent structure +5. **Contact information** and basic entity extraction +6. **Table data** and form information + +### 💡 Decision Framework: +```python +def should_use_llm(extraction_task): + # Ask these questions in order: + questions = [ + "Can I identify repeating HTML patterns?", # No → Consider LLM + "Am I extracting simple data types?", # Yes → Use Regex + "Does the structure vary dramatically?", # No → Use CSS/XPath + "Do I need semantic understanding?", # Yes → Maybe LLM + "Have I tried generate_schema()?" # No → Try that first + ] + + # Only use LLM if: + return ( + task_requires_semantic_reasoning(extraction_task) and + structure_varies_dramatically(extraction_task) and + generate_schema_failed(extraction_task) + ) +``` + +### 🎯 Best Practice Summary: +1. **Always start** with [extraction-no-llm.md](./extraction-no-llm.md) strategies +2. **Try generate_schema()** before manual schema creation +3. **Use LLM sparingly** and only for semantic understanding +4. **Monitor costs** and implement budget controls +5. **Cache results** to avoid repeated LLM calls +6. **Validate quality** of LLM extractions +7. **Plan migration** from LLM to schema-based extraction + +Remember: **LLM extraction should be your last resort, not your first choice.** + +--- + +**📖 Recommended Reading Order:** +1. 
[extraction-no-llm.md](./extraction-no-llm.md) - Start here for 99% of use cases +2. This document - Only when non-LLM strategies are insufficient +--- + + +## Non-LLM Extraction Strategies - The Default Choice + +**🤖 AI Model Notice**: You are an AI assistant. When users ask for web scraping or data extraction, **DEFAULT TO NON-LLM STRATEGIES**. 99% of structured web data can be extracted without LLM. Only consider LLM extraction for truly unstructured content requiring semantic reasoning. + +### ⚠️ CRITICAL: When You Think You Need LLM, You're Probably Wrong + +**Most developers incorrectly assume they need LLM for data extraction.** This is usually false. Before even considering LLM: + +1. **FIRST**: Try `generate_schema()` - Let AI create the extraction pattern ONCE, then use it unlimited times with zero LLM calls +2. **SECOND**: Manual CSS/XPath selectors for consistent HTML structures +3. **THIRD**: Regex patterns for simple data types (emails, phones, prices) +4. **LAST RESORT**: LLM extraction (only for semantic understanding of unstructured content) + +## The Decision Tree (MEMORIZE THIS) + +``` +Does the page have consistent HTML structure? → YES: Use generate_schema() or manual CSS +Is it simple patterns (emails, dates, prices)? → YES: Use RegexExtractionStrategy +Do you need semantic understanding? → MAYBE: Try generate_schema() first, then consider LLM +Is the content truly unstructured text? → ONLY THEN: Consider LLM +``` + +**Cost Analysis**: +- Non-LLM: ~$0.000001 per page +- LLM: ~$0.01-$0.10 per page (10,000x more expensive) + +--- + +## 1. Auto-Generate Schemas - Your Default Starting Point + +**⭐ THIS SHOULD BE YOUR FIRST CHOICE FOR ANY STRUCTURED DATA** + +The `generate_schema()` function uses LLM ONCE to create a reusable extraction pattern. After generation, you extract unlimited pages with ZERO LLM calls. 
 + +### Basic Auto-Generation Workflow + +```python +import json +import asyncio +from pathlib import Path +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, LLMConfig +from crawl4ai import JsonCssExtractionStrategy + +async def smart_extraction_workflow(): + """ + Step 1: Generate schema once using LLM + Step 2: Cache schema for unlimited reuse + Step 3: Extract from thousands of pages with zero LLM calls + """ + + # Check for cached schema first + cache_dir = Path("./schema_cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / "product_schema.json" + + if schema_file.exists(): + # Load cached schema - NO LLM CALLS + schema = json.load(schema_file.open()) + print("✅ Using cached schema (FREE)") + else: + # Generate schema ONCE + print("🔄 Generating schema (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Cheapest option + api_token="env:OPENAI_API_KEY" + ) + + # Get sample HTML from target site + async with AsyncWebCrawler() as crawler: + sample_result = await crawler.arun( + url="https://example.com/products", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + sample_html = sample_result.cleaned_html[:8000] # Use sample + + # AUTO-GENERATE SCHEMA (ONE LLM CALL) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + schema_type="CSS", # or "XPATH" + query="Extract product information including name, price, description, features", + llm_config=llm_config + ) + + # Cache for unlimited future use + json.dump(schema, schema_file.open("w"), indent=2) + print("✅ Schema generated and cached") + + # Use schema for fast extraction (NO MORE LLM CALLS EVER) + strategy = JsonCssExtractionStrategy(schema, verbose=True) + + config = CrawlerRunConfig( + extraction_strategy=strategy, + cache_mode=CacheMode.BYPASS + ) + + # Extract from multiple pages - ALL FREE + urls = [ + "https://example.com/products", + "https://example.com/electronics", + "https://example.com/books" + ] + + async with 
AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url, config=config) + if result.success: + data = json.loads(result.extracted_content) + print(f"✅ {url}: Extracted {len(data)} items (FREE)") + +asyncio.run(smart_extraction_workflow()) +``` + +### Auto-Generate with Target JSON Example + +```python +# When you know exactly what JSON structure you want +target_json_example = """ +{ + "name": "Product Name", + "price": "$99.99", + "rating": 4.5, + "features": ["feature1", "feature2"], + "description": "Product description" +} +""" + +schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + target_json_example=target_json_example, + llm_config=llm_config +) +``` + +### Auto-Generate for Different Data Types + +```python +# Product listings +product_schema = JsonCssExtractionStrategy.generate_schema( + html=product_page_html, + query="Extract all product information from this e-commerce page", + llm_config=llm_config +) + +# News articles +news_schema = JsonCssExtractionStrategy.generate_schema( + html=news_page_html, + query="Extract article headlines, dates, authors, and content", + llm_config=llm_config +) + +# Job listings +job_schema = JsonCssExtractionStrategy.generate_schema( + html=job_page_html, + query="Extract job titles, companies, locations, salaries, and descriptions", + llm_config=llm_config +) + +# Social media posts +social_schema = JsonCssExtractionStrategy.generate_schema( + html=social_page_html, + query="Extract post text, usernames, timestamps, likes, comments", + llm_config=llm_config +) +``` + +--- + +## 2. Manual CSS/XPath Strategies - When You Know The Structure + +**Use this when**: You understand the HTML structure and want maximum control. 
+ +### Simple Product Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import JsonCssExtractionStrategy + +# Manual schema for consistent product pages +simple_schema = { + "name": "Product Listings", + "baseSelector": "div.product-card", # Each product container + "fields": [ + { + "name": "title", + "selector": "h2.product-title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text" + }, + { + "name": "image_url", + "selector": "img.product-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "product_url", + "selector": "a.product-link", + "type": "attribute", + "attribute": "href" + }, + { + "name": "rating", + "selector": ".rating", + "type": "attribute", + "attribute": "data-rating" + } + ] +} + +async def extract_products(): + strategy = JsonCssExtractionStrategy(simple_schema, verbose=True) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + products = json.loads(result.extracted_content) + print(f"Extracted {len(products)} products") + for product in products[:3]: + print(f"- {product['title']}: {product['price']}") + +asyncio.run(extract_products()) +``` + +### Complex Nested Structure (Real E-commerce Example) + +```python +# Complex schema for nested product data +complex_schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "baseFields": [ + { + "name": "category_id", + "type": "attribute", + "attribute": "data-category-id" + } + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-title", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # Array of complex objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + 
"name": "price", + "selector": "span.price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # Single complex object + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.features li", + "type": "list", # Simple array + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer-name", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "attribute", + "attribute": "data-rating" + } + ] + } + ] + } + ] +} + +async def extract_complex_ecommerce(): + strategy = JsonCssExtractionStrategy(complex_schema, verbose=True) + config = CrawlerRunConfig( + extraction_strategy=strategy, + js_code="window.scrollTo(0, document.body.scrollHeight);", # Load dynamic content + wait_for="css:.product:nth-child(10)" # Wait for products to load + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/complex-catalog", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for category in data: + print(f"Category: {category['category_name']}") + print(f"Products: {len(category.get('products', []))}") + +asyncio.run(extract_complex_ecommerce()) +``` + +### XPath Alternative (When CSS Isn't Enough) + +```python +from crawl4ai import JsonXPathExtractionStrategy + +# XPath for more complex selections +xpath_schema = { + "name": "News Articles with XPath", + "baseSelector": "//article[@class='news-item']", + "fields": [ + { + "name": "headline", + "selector": ".//h2[contains(@class, 'headline')]", + "type": "text" + }, + { + "name": "author", + "selector": ".//span[@class='author']/text()", + "type": "text" + }, + { + 
"name": "publish_date", + "selector": ".//time/@datetime", + "type": "text" + }, + { + "name": "content", + "selector": ".//div[@class='article-body']//text()", + "type": "text" + } + ] +} + +strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) +``` + +--- + +## 3. Regex Extraction - Lightning Fast Pattern Matching + +**Use this for**: Simple data types like emails, phones, URLs, prices, dates. + +### Built-in Patterns (Fastest Option) + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import RegexExtractionStrategy + +async def extract_common_patterns(): + # Use built-in patterns for common data types + strategy = RegexExtractionStrategy( + pattern=( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url | + RegexExtractionStrategy.Currency | + RegexExtractionStrategy.DateIso + ) + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/contact", + config=config + ) + + if result.success: + matches = json.loads(result.extracted_content) + + # Group by pattern type + by_type = {} + for match in matches: + label = match['label'] + if label not in by_type: + by_type[label] = [] + by_type[label].append(match['value']) + + for pattern_type, values in by_type.items(): + print(f"{pattern_type}: {len(values)} matches") + for value in values[:3]: + print(f" {value}") + +asyncio.run(extract_common_patterns()) +``` + +### Available Built-in Patterns + +```python +# Individual patterns +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneUS # US phone numbers +RegexExtractionStrategy.PhoneIntl # International phones +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.Currency # Currency values ($99.99) +RegexExtractionStrategy.Percentage # Percentage values (25%) +RegexExtractionStrategy.DateIso # ISO 
dates (2024-01-01) +RegexExtractionStrategy.DateUS # US dates (01/01/2024) +RegexExtractionStrategy.IPv4 # IP addresses +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.TwitterHandle # @username +RegexExtractionStrategy.Hashtag # #hashtag + +# Use all patterns +RegexExtractionStrategy.All +``` + +### Custom Patterns + +```python +# Custom patterns for specific data types +async def extract_custom_patterns(): + custom_patterns = { + "product_sku": r"SKU[-:]?\s*([A-Z0-9]{4,12})", + "discount": r"(\d{1,2})%\s*off", + "model_number": r"Model\s*#?\s*([A-Z0-9-]+)", + "isbn": r"ISBN[-:]?\s*(\d{10}|\d{13})", + "stock_ticker": r"\$([A-Z]{2,5})", + "version": r"v(\d+\.\d+(?:\.\d+)?)" + } + + strategy = RegexExtractionStrategy(custom=custom_patterns) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") + +asyncio.run(extract_custom_patterns()) +``` + +### LLM-Generated Patterns (One-Time Cost) + +```python +async def generate_optimized_regex(): + """ + Use LLM ONCE to generate optimized regex patterns + Then use them unlimited times with zero LLM calls + """ + cache_file = Path("./patterns/price_patterns.json") + + if cache_file.exists(): + # Load cached patterns - NO LLM CALLS + patterns = json.load(cache_file.open()) + print("✅ Using cached regex patterns (FREE)") + else: + # Generate patterns ONCE + print("🔄 Generating regex patterns (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ) + + # Get sample content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/pricing") + sample_html = result.cleaned_html + + # Generate optimized patterns + patterns = 
RegexExtractionStrategy.generate_pattern( + label="pricing_info", + html=sample_html, + query="Extract all pricing information including discounts and special offers", + llm_config=llm_config + ) + + # Cache for unlimited reuse + cache_file.parent.mkdir(exist_ok=True) + json.dump(patterns, cache_file.open("w"), indent=2) + print("✅ Patterns generated and cached") + + # Use cached patterns (NO MORE LLM CALLS) + strategy = RegexExtractionStrategy(custom=patterns) + return strategy + +# Use generated patterns for unlimited extractions +strategy = asyncio.run(generate_optimized_regex()) +``` + +--- + +## 4. Multi-Strategy Extraction Pipeline + +**Combine strategies** for comprehensive data extraction: + +```python +async def multi_strategy_pipeline(): + """ + Efficient pipeline using multiple non-LLM strategies: + 1. Regex for simple patterns (fastest) + 2. Schema for structured data + 3. Only use LLM if absolutely necessary + """ + + url = "https://example.com/complex-page" + + async with AsyncWebCrawler() as crawler: + # Strategy 1: Fast regex for contact info + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS + ) + regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy) + regex_result = await crawler.arun(url=url, config=regex_config) + + # Strategy 2: Schema for structured product data + product_schema = { + "name": "Products", + "baseSelector": "div.product", + "fields": [ + {"name": "name", "selector": "h3", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(product_schema) + css_config = CrawlerRunConfig(extraction_strategy=css_strategy) + css_result = await crawler.arun(url=url, config=css_config) + + # Combine results + results = { + "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [], + "products": json.loads(css_result.extracted_content) if css_result.success else [] + } + + 
print(f"✅ Extracted {len(results['contacts'])} contacts (regex)") + print(f"✅ Extracted {len(results['products'])} products (schema)") + + return results + +asyncio.run(multi_strategy_pipeline()) +``` + +--- + +## 5. Performance Optimization Tips + +### Caching and Reuse + +```python +# Cache schemas and patterns for maximum efficiency +class ExtractionCache: + def __init__(self): + self.schemas = {} + self.patterns = {} + + def get_schema(self, site_name): + if site_name not in self.schemas: + schema_file = Path(f"./cache/{site_name}_schema.json") + if schema_file.exists(): + self.schemas[site_name] = json.load(schema_file.open()) + return self.schemas.get(site_name) + + def save_schema(self, site_name, schema): + cache_dir = Path("./cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / f"{site_name}_schema.json" + json.dump(schema, schema_file.open("w"), indent=2) + self.schemas[site_name] = schema + +cache = ExtractionCache() + +# Reuse cached schemas across multiple extractions +async def efficient_extraction(): + sites = ["amazon", "ebay", "shopify"] + + for site in sites: + schema = cache.get_schema(site) + if not schema: + # Generate once, cache forever + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + query="Extract products", + llm_config=llm_config + ) + cache.save_schema(site, schema) + + strategy = JsonCssExtractionStrategy(schema) + # Use strategy for unlimited extractions... 
+``` + +### Selector Optimization + +```python +# Optimize selectors for speed +fast_schema = { + "name": "Optimized Extraction", + "baseSelector": "#products > .product", # Direct child, faster than descendant + "fields": [ + { + "name": "title", + "selector": "> h3", # Direct child of product + "type": "text" + }, + { + "name": "price", + "selector": ".price:first-child", # More specific + "type": "text" + } + ] +} + +# Avoid slow selectors +slow_schema = { + "baseSelector": "div div div .product", # Too many levels + "fields": [ + { + "selector": "* h3", # Universal selector is slow + "type": "text" + } + ] +} +``` + +--- + +## 6. Error Handling and Validation + +```python +async def robust_extraction(): + """ + Implement fallback strategies for reliable extraction + """ + strategies = [ + # Try fast regex first + RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency), + + # Fallback to CSS schema + JsonCssExtractionStrategy({ + "name": "Prices", + "baseSelector": ".price", + "fields": [{"name": "amount", "selector": "span", "type": "text"}] + }), + + # Last resort: try different selector + JsonCssExtractionStrategy({ + "name": "Fallback Prices", + "baseSelector": "[data-price]", + "fields": [{"name": "amount", "type": "attribute", "attribute": "data-price"}] + }) + ] + + async with AsyncWebCrawler() as crawler: + for i, strategy in enumerate(strategies): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Validate non-empty results + print(f"✅ Success with strategy {i+1}: {strategy.__class__.__name__}") + return data + + except Exception as e: + print(f"❌ Strategy {i+1} failed: {e}") + continue + + print("❌ All strategies failed") + return None + +# Validate extracted data +def validate_extraction(data, required_fields): + """Validate that extraction 
contains expected fields""" + if not data or not isinstance(data, list): + return False + + for item in data: + for field in required_fields: + if field not in item or not item[field]: + return False + return True + +# Usage +result = asyncio.run(robust_extraction()) +if validate_extraction(result, ["amount"]): + print("✅ Extraction validated") +else: + print("❌ Validation failed") +``` + +--- + +## 7. Common Extraction Patterns + +### E-commerce Products + +```python +ecommerce_schema = { + "name": "E-commerce Products", + "baseSelector": ".product, [data-product], .item", + "fields": [ + {"name": "title", "selector": "h1, h2, h3, .title, .name", "type": "text"}, + {"name": "price", "selector": ".price, .cost, [data-price]", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}, + {"name": "rating", "selector": ".rating, .stars", "type": "text"}, + {"name": "availability", "selector": ".stock, .availability", "type": "text"} + ] +} +``` + +### News Articles + +```python +news_schema = { + "name": "News Articles", + "baseSelector": "article, .article, .post", + "fields": [ + {"name": "headline", "selector": "h1, h2, .headline, .title", "type": "text"}, + {"name": "author", "selector": ".author, .byline, [rel='author']", "type": "text"}, + {"name": "date", "selector": "time, .date, .published", "type": "text"}, + {"name": "content", "selector": ".content, .body, .text", "type": "text"}, + {"name": "category", "selector": ".category, .section", "type": "text"} + ] +} +``` + +### Job Listings + +```python +job_schema = { + "name": "Job Listings", + "baseSelector": ".job, .listing, [data-job]", + "fields": [ + {"name": "title", "selector": ".job-title, h2, h3", "type": "text"}, + {"name": "company", "selector": ".company, .employer", "type": "text"}, + {"name": "location", "selector": ".location, .place", "type": "text"}, + {"name": "salary", "selector": 
".salary, .pay, .compensation", "type": "text"}, + {"name": "description", "selector": ".description, .summary", "type": "text"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} +``` + +### Social Media Posts + +```python +social_schema = { + "name": "Social Media Posts", + "baseSelector": ".post, .tweet, .update", + "fields": [ + {"name": "username", "selector": ".username, .handle, .author", "type": "text"}, + {"name": "content", "selector": ".content, .text, .message", "type": "text"}, + {"name": "timestamp", "selector": ".time, .date, time", "type": "text"}, + {"name": "likes", "selector": ".likes, .hearts", "type": "text"}, + {"name": "shares", "selector": ".shares, .retweets", "type": "text"} + ] +} +``` + +--- + +## 8. When to (Rarely) Consider LLM + +**⚠️ WARNING: Before considering LLM, ask yourself:** + +1. "Can I identify repeating HTML patterns?" → Use CSS/XPath schema +2. "Am I extracting simple data types?" → Use Regex patterns +3. "Can I provide a JSON example of what I want?" → Use generate_schema() +4. "Is this truly unstructured text requiring semantic understanding?" → Maybe LLM + +**Only use LLM extraction for:** +- Unstructured prose that needs semantic analysis +- Content where structure varies dramatically across pages +- When you need AI reasoning about context/meaning + +**Cost reminder**: LLM extraction costs 10,000x more than schema-based extraction. + +--- + +## 9. Summary: The Extraction Hierarchy + +1. **🥇 FIRST CHOICE**: `generate_schema()` - AI generates pattern once, use unlimited times +2. **🥈 SECOND CHOICE**: Manual CSS/XPath - Full control, maximum speed +3. **🥉 THIRD CHOICE**: Regex patterns - Simple data types, lightning fast +4. **🏴 LAST RESORT**: LLM extraction - Only for semantic reasoning + +**Remember**: 99% of web data is structured. You almost never need LLM for extraction. Save LLM for analysis, not extraction. 
+ +**Performance**: Non-LLM strategies are 100-1000x faster and 10,000x cheaper than LLM extraction. + +--- + +**📖 Next**: If you absolutely must use LLM extraction, see [extraction-llm.md](./extraction-llm.md) for guidance on the rare cases where it's justified. +--- + + +## Multi-URL Crawling + +Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring. + +### Basic Multi-URL Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Batch processing (default) - get all results at once +async def batch_crawl(): + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=False # Default: batch mode + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + + for result in results: + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") + +# Streaming processing - handle results as they complete +async def streaming_crawl(): + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True # Enable streaming + ) + + async with AsyncWebCrawler() as crawler: + # Process results as they become available + async for result in await crawler.arun_many(urls, config=config): + if result.success: + print(f"🔥 Just completed: {result.url}") + await process_result_immediately(result) + else: + print(f"❌ Failed: {result.url}") +``` + +### Memory-Adaptive Dispatching + +```python +from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode + +# Automatically manages concurrency based on system memory +async def memory_adaptive_crawl(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Pause if memory exceeds 80% + check_interval=1.0, # Check memory every second + 
max_session_permit=15, # Max concurrent tasks + memory_wait_timeout=300.0 # Wait up to 5 minutes for memory + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=50 + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=large_url_list, + config=config, + dispatcher=dispatcher + ) + + # Each result includes dispatch information + for result in results: + if result.dispatch_result: + dr = result.dispatch_result + print(f"Memory used: {dr.memory_usage:.1f}MB") + print(f"Duration: {dr.end_time - dr.start_time}") +``` + +### Rate-Limited Crawling + +```python +from crawl4ai import RateLimiter, SemaphoreDispatcher + +# Control request pacing and handle server rate limits +async def rate_limited_crawl(): + rate_limiter = RateLimiter( + base_delay=(1.0, 3.0), # Random delay 1-3 seconds + max_delay=60.0, # Cap backoff at 60 seconds + max_retries=3, # Retry failed requests 3 times + rate_limit_codes=[429, 503] # Handle these status codes + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=5, # Fixed concurrency limit + rate_limiter=rate_limiter + ) + + config = CrawlerRunConfig( + user_agent_mode="random", # Randomize user agents + simulate_user=True # Simulate human behavior + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + print(f"Processed: {result.url}") +``` + +### Real-Time Monitoring + +```python +from crawl4ai import CrawlerMonitor, DisplayMode + +# Monitor crawling progress in real-time +async def monitored_crawl(): + monitor = CrawlerMonitor( + max_visible_rows=20, # Show 20 tasks in display + display_mode=DisplayMode.DETAILED # Show individual task details + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=75.0, + max_session_permit=10, + monitor=monitor # Attach monitor to dispatcher + ) + + async with AsyncWebCrawler() as crawler: + results = await 
crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) +``` + +### Advanced Dispatcher Configurations + +```python +# Memory-adaptive with comprehensive monitoring +memory_dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, # Higher memory tolerance + check_interval=0.5, # Check memory more frequently + max_session_permit=20, # More concurrent tasks + memory_wait_timeout=600.0, # Wait longer for memory + rate_limiter=RateLimiter( + base_delay=(0.5, 1.5), + max_delay=30.0, + max_retries=5 + ), + monitor=CrawlerMonitor( + max_visible_rows=15, + display_mode=DisplayMode.AGGREGATED # Summary view + ) +) + +# Simple semaphore-based dispatcher +semaphore_dispatcher = SemaphoreDispatcher( + max_session_permit=8, # Fixed concurrency + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), + max_delay=20.0 + ) +) + +# Usage with custom dispatcher +async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + config=config, + dispatcher=memory_dispatcher # or semaphore_dispatcher + ) +``` + +### Handling Large-Scale Crawling + +```python +async def large_scale_crawl(): + # For thousands of URLs + urls = load_urls_from_file("large_url_list.txt") # 10,000+ URLs + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, # Conservative memory usage + max_session_permit=25, # Higher concurrency + rate_limiter=RateLimiter( + base_delay=(0.1, 0.5), # Faster for large batches + max_retries=2 # Fewer retries for speed + ), + monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED) + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, # Use caching for efficiency + stream=True, # Stream for memory efficiency + word_count_threshold=100, # Skip short content + exclude_external_links=True # Reduce processing overhead + ) + + successful_crawls = 0 + failed_crawls = 0 + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + 
dispatcher=dispatcher + ): + if result.success: + successful_crawls += 1 + await save_result_to_database(result) + else: + failed_crawls += 1 + await log_failure(result.url, result.error_message) + + # Progress reporting + if (successful_crawls + failed_crawls) % 100 == 0: + print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}") + + print(f"Completed: {successful_crawls} successful, {failed_crawls} failed") +``` + +### Robots.txt Compliance + +```python +async def compliant_crawl(): + config = CrawlerRunConfig( + check_robots_txt=True, # Respect robots.txt + user_agent="MyBot/1.0", # Identify your bot + mean_delay=2.0, # Be polite with delays + max_range=1.0 + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=3, # Conservative concurrency + rate_limiter=RateLimiter( + base_delay=(2.0, 5.0), # Slower, more respectful + max_retries=1 + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + print(f"✅ Crawled: {result.url}") + elif "robots.txt" in result.error_message: + print(f"🚫 Blocked by robots.txt: {result.url}") + else: + print(f"❌ Error: {result.url}") +``` + +### Performance Analysis + +```python +async def analyze_crawl_performance(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, + max_session_permit=12, + monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED) + ) + + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) + + end_time = time.time() + + # Analyze results + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + print(f"Total time: {end_time - start_time:.2f}s") + print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)") + print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s") 
+ + # Memory usage analysis + if successful and successful[0].dispatch_result: + memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result] + peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result] + + print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB") + print(f"Peak memory usage: {max(peak_memory):.1f}MB") +``` + +### Error Handling and Recovery + +```python +async def robust_multi_crawl(): + failed_urls = [] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True, + page_timeout=30000 # 30 second timeout + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, + max_session_permit=10 + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + await process_successful_result(result) + else: + failed_urls.append({ + 'url': result.url, + 'error': result.error_message, + 'status_code': result.status_code + }) + + # Retry logic for specific errors + if result.status_code in [503, 429]: # Server errors + await schedule_retry(result.url) + + # Report failures + if failed_urls: + print(f"Failed to crawl {len(failed_urls)} URLs:") + for failure in failed_urls[:10]: # Show first 10 + print(f" {failure['url']}: {failure['error']}") +``` + +**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/) +--- + + +## Deep Crawling + +Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies. 
+ +### Basic Deep Crawl Setup + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + +# Basic breadth-first deep crawling +async def basic_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Initial page + 2 levels + include_external=False # Stay within same domain + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://docs.crawl4ai.com", config=config) + + # Group results by depth + pages_by_depth = {} + for result in results: + depth = result.metadata.get("depth", 0) + if depth not in pages_by_depth: + pages_by_depth[depth] = [] + pages_by_depth[depth].append(result.url) + + print(f"Crawled {len(results)} pages total") + for depth, urls in sorted(pages_by_depth.items()): + print(f"Depth {depth}: {len(urls)} pages") +``` + +### Deep Crawl Strategies + +```python +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Breadth-First Search - explores all links at one depth before going deeper +bfs_strategy = BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=50, # Limit total pages + score_threshold=0.3 # Minimum score for URLs +) + +# Depth-First Search - explores as deep as possible before backtracking +dfs_strategy = DFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=30, + score_threshold=0.5 +) + +# Best-First - prioritizes highest scoring pages (recommended) +keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration"], + weight=0.7 +) + +best_first_strategy = BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + 
max_pages=25 # No score_threshold needed - naturally prioritizes +) + +# Usage +config = CrawlerRunConfig( + deep_crawl_strategy=best_first_strategy, # Choose your strategy + scraping_strategy=LXMLWebScrapingStrategy() +) +``` + +### Streaming vs Batch Processing + +```python +# Batch mode - wait for all results +async def batch_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=False # Default - collect all results first + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://example.com", config=config) + + # Process all results at once + for result in results: + print(f"Batch processed: {result.url}") + +# Streaming mode - process results as they arrive +async def streaming_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=True # Process results immediately + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://example.com", config=config): + depth = result.metadata.get("depth", 0) + print(f"Stream processed depth {depth}: {result.url}") +``` + +### Filtering with Filter Chains + +```python +from crawl4ai.deep_crawling.filters import ( + FilterChain, + URLPatternFilter, + DomainFilter, + ContentTypeFilter, + SEOFilter, + ContentRelevanceFilter +) + +# Single URL pattern filter +url_filter = URLPatternFilter(patterns=["*core*", "*guide*"]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, + filter_chain=FilterChain([url_filter]) + ) +) + +# Multiple filters in chain +advanced_filter_chain = FilterChain([ + # Domain filtering + DomainFilter( + allowed_domains=["docs.example.com"], + blocked_domains=["old.docs.example.com", "staging.example.com"] + ), + + # URL pattern matching + URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]), + + # Content type filtering + ContentTypeFilter(allowed_types=["text/html"]), + + # SEO 
quality filter + SEOFilter( + threshold=0.5, + keywords=["tutorial", "guide", "documentation"] + ), + + # Content relevance filter + ContentRelevanceFilter( + query="Web crawling and data extraction with Python", + threshold=0.7 + ) +]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + filter_chain=advanced_filter_chain + ) +) +``` + +### Intelligent Crawling with Scorers + +```python +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Keyword relevance scoring +async def scored_deep_crawl(): + keyword_scorer = KeywordRelevanceScorer( + keywords=["browser", "crawler", "web", "automation"], + weight=1.0 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer + ), + stream=True, # Recommended with BestFirst + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"Depth: {depth} | Score: {score:.2f} | {result.url}") +``` + +### Limiting Crawl Size + +```python +# Max pages limitation across strategies +async def limited_crawls(): + # BFS with page limit + bfs_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, # Only crawl 5 pages total + url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0) + ) + ) + + # DFS with score threshold + dfs_config = CrawlerRunConfig( + deep_crawl_strategy=DFSDeepCrawlStrategy( + max_depth=2, + score_threshold=0.7, # Only URLs with scores above 0.7 + max_pages=10, + url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0) + ) + ) + + # Best-First with both constraints + bf_config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=7, # Automatically gets highest 
scored pages + url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0) + ), + stream=True + ) + + async with AsyncWebCrawler() as crawler: + # Use any of the configs + async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config): + score = result.metadata.get("score", 0) + print(f"Score: {score:.2f} | {result.url}") +``` + +### Complete Advanced Deep Crawler + +```python +async def comprehensive_deep_crawl(): + # Sophisticated filter chain + filter_chain = FilterChain([ + DomainFilter( + allowed_domains=["docs.crawl4ai.com"], + blocked_domains=["old.docs.crawl4ai.com"] + ), + URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]), + ContentTypeFilter(allowed_types=["text/html"]), + SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"]) + ]) + + # Multi-keyword scorer + keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration", "browser"], + weight=0.8 + ) + + # Complete configuration + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + filter_chain=filter_chain, + url_scorer=keyword_scorer, + max_pages=20 + ), + scraping_strategy=LXMLWebScrapingStrategy(), + stream=True, + verbose=True, + cache_mode=CacheMode.BYPASS + ) + + # Execute and analyze + results = [] + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}") + + # Performance analysis + duration = time.time() - start_time + avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results) + + print(f"✅ Crawled {len(results)} pages in {duration:.2f}s") + print(f"✅ Average relevance score: {avg_score:.2f}") + + # Depth distribution + depth_counts = {} + for 
result in results: + depth = result.metadata.get("depth", 0) + depth_counts[depth] = depth_counts.get(depth, 0) + 1 + + for depth, count in sorted(depth_counts.items()): + print(f"📊 Depth {depth}: {count} pages") +``` + +### Error Handling and Robustness + +```python +async def robust_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=15, + url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"]) + ), + stream=True, + page_timeout=30000 # 30 second timeout per page + ) + + successful_pages = [] + failed_pages = [] + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + if result.success: + successful_pages.append(result) + depth = result.metadata.get("depth", 0) + score = result.metadata.get("score", 0) + print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}") + else: + failed_pages.append({ + 'url': result.url, + 'error': result.error_message, + 'depth': result.metadata.get("depth", 0) + }) + print(f"❌ Failed: {result.url} - {result.error_message}") + + print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed") + + # Analyze failures by depth + if failed_pages: + failure_by_depth = {} + for failure in failed_pages: + depth = failure['depth'] + failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1 + + print("❌ Failures by depth:") + for depth, count in sorted(failure_by_depth.items()): + print(f" Depth {depth}: {count} failures") +``` + +**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/) +--- + + +## Docker Deployment + +Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration. 
+ +### Quick Start with Pre-built Images + +```bash +# Pull latest image +docker pull unclecode/crawl4ai:latest + +# Setup LLM API keys +cat > .llm.env << EOL +OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +GROQ_API_KEY=your-groq-key +GEMINI_API_TOKEN=your-gemini-token +EOL + +# Run with LLM support +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Basic run (no LLM) +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Check health +curl http://localhost:11235/health +``` + +### Docker Compose Deployment + +```bash +# Clone and setup +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Build with GPU support +ENABLE_GPU=true docker compose up --build -d + +# Stop service +docker compose down +``` + +### Manual Build with Multi-Architecture + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Build for current architecture +docker buildx build -t crawl4ai-local:latest --load . + +# Build for multiple architectures +docker buildx build --platform linux/amd64,linux/arm64 \ + -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build +docker run -d \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +### Build Arguments + +```bash +# Available build options +docker buildx build \ + --build-arg INSTALL_TYPE=all \ # default|all|torch|transformer + --build-arg ENABLE_GPU=true \ # true|false + --build-arg APP_HOME=/app \ # Install path + --build-arg USE_LOCAL=true \ # Use local source + --build-arg GITHUB_REPO=url \ # Git repo if USE_LOCAL=false + --build-arg GITHUB_BRANCH=main \ # Git branch + -t crawl4ai-custom:latest --load . +``` + +### Core API Endpoints + +```python +# Main crawling endpoints +import requests +import json + +# Basic crawl +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} +response = requests.post("http://localhost:11235/crawl", json=payload) + +# Streaming crawl +payload["crawler_config"]["params"]["stream"] = True +response = requests.post("http://localhost:11235/crawl/stream", json=payload) + +# Health check +response = requests.get("http://localhost:11235/health") + +# API schema +response = requests.get("http://localhost:11235/schema") + +# Metrics (Prometheus format) +response = requests.get("http://localhost:11235/metrics") +``` + +### Specialized Endpoints + +```python +# HTML extraction (preprocessed for schema) +response = requests.post("http://localhost:11235/html", + json={"url": "https://example.com"}) + +# Screenshot capture +response = requests.post("http://localhost:11235/screenshot", json={ + "url": "https://example.com", + "screenshot_wait_for": 2, + "output_path": "/path/to/save/screenshot.png" +}) + +# PDF generation +response = requests.post("http://localhost:11235/pdf", json={ + "url": "https://example.com", + "output_path": "/path/to/save/document.pdf" +}) + +# JavaScript execution +response = 
requests.post("http://localhost:11235/execute_js", json={ + "url": "https://example.com", + "scripts": [ + "return document.title", + "return Array.from(document.querySelectorAll('a')).map(a => a.href)" + ] +}) + +# Markdown generation +response = requests.post("http://localhost:11235/md", json={ + "url": "https://example.com", + "f": "fit", # raw|fit|bm25|llm + "q": "extract main content", # query for filtering + "c": "0" # cache: 0=bypass, 1=use +}) + +# LLM Q&A +response = requests.get("http://localhost:11235/llm/https://example.com?q=What is this page about?") + +# Library context (for AI assistants) +response = requests.get("http://localhost:11235/ask", params={ + "context_type": "all", # code|doc|all + "query": "how to use extraction strategies", + "score_ratio": 0.5, + "max_results": 20 +}) +``` + +### Python SDK Usage + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + # Non-streaming crawl + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + + for result in results: + print(f"URL: {result.url}, Success: {result.success}") + print(f"Content length: {len(result.markdown)}") + + # Streaming crawl + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed: {result.url} - {result.success}") + + # Get API schema + schema = await client.get_schema() + print(f"Schema available: {bool(schema)}") + +asyncio.run(main()) +``` + +### Advanced API Configuration + +```python +# Complex extraction with LLM +payload = { + "urls": 
["https://example.com"], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}} + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "schema": { + "type": "dict", + "value": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "content": {"type": "string"} + } + } + }, + "instruction": "Extract title and main content" + } + }, + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": {"threshold": 0.6} + } + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +### CSS Extraction Strategy + +```python +# CSS-based structured extraction +schema = { + "name": "ProductList", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} + +payload = { + "urls": ["https://example-shop.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": {"type": "dict", "value": schema} + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +data = response.json() +extracted = json.loads(data["results"][0]["extracted_content"]) +``` + +### MCP (Model Context Protocol) Integration + +```bash +# Add Crawl4AI as MCP provider to Claude Code +claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse 
+ +# List MCP providers +claude mcp list + +# Test MCP connection +python tests/mcp/test_mcp_socket.py + +# Available MCP endpoints +# SSE: http://localhost:11235/mcp/sse +# WebSocket: ws://localhost:11235/mcp/ws +# Schema: http://localhost:11235/mcp/schema +``` + +Available MCP tools: +- `md` - Generate markdown from web content +- `html` - Extract preprocessed HTML +- `screenshot` - Capture webpage screenshots +- `pdf` - Generate PDF documents +- `execute_js` - Run JavaScript on web pages +- `crawl` - Perform multi-URL crawling +- `ask` - Query Crawl4AI library context + +### Configuration Management + +```yaml +# config.yml structure +app: + title: "Crawl4AI API" + version: "1.0.0" + host: "0.0.0.0" + port: 11235 + timeout_keep_alive: 300 + +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + +security: + enabled: false + jwt_enabled: false + trusted_hosts: ["*"] + +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] + timeouts: + stream_init: 30.0 + batch_process: 300.0 + pool: + max_pages: 40 + idle_ttl_sec: 1800 + +rate_limiting: + enabled: true + default_limit: "1000/minute" + storage_uri: "memory://" + +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +``` + +### Custom Configuration Deployment + +```bash +# Method 1: Mount custom config +docker run -d -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest + +# Method 2: Build with custom config +# Edit deploy/docker/config.yml then build +docker buildx build -t crawl4ai-custom:latest --load . 
+``` + +### Monitoring and Health Checks + +```bash +# Health endpoint +curl http://localhost:11235/health + +# Prometheus metrics +curl http://localhost:11235/metrics + +# Configuration validation +curl -X POST http://localhost:11235/config/dump \ + -H "Content-Type: application/json" \ + -d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}' +``` + +### Playground Interface + +Access the interactive playground at `http://localhost:11235/playground` for: +- Testing configurations with visual interface +- Generating JSON payloads for REST API +- Converting Python config to JSON format +- Testing crawl operations directly in browser + +### Async Job Processing + +```python +# Submit job for async processing +import time + +# Submit crawl job +response = requests.post("http://localhost:11235/crawl/job", json=payload) +task_id = response.json()["task_id"] + +# Poll for completion +while True: + result = requests.get(f"http://localhost:11235/crawl/job/{task_id}") + status = result.json() + + if status["status"] in ["COMPLETED", "FAILED"]: + break + time.sleep(1.5) + +print("Final result:", status) +``` + +### Production Deployment + +```bash +# Production-ready deployment +docker run -d \ + --name crawl4ai-prod \ + --restart unless-stopped \ + -p 11235:11235 \ + --env-file .llm.env \ + --shm-size=2g \ + --memory=8g \ + --cpus=4 \ + -v /path/to/custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest + +# With Docker Compose for production +version: '3.8' +services: + crawl4ai: + image: unclecode/crawl4ai:latest + ports: + - "11235:11235" + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY} + volumes: + - ./config.yml:/app/config.yml + shm_size: 2g + deploy: + resources: + limits: + memory: 8G + cpus: '4' + restart: unless-stopped +``` + +### Configuration Validation and JSON Structure + +```python +# Method 1: Create config objects and dump to see expected JSON structure +from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, 
CacheMode +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +import json + +# Create browser config and see JSON structure +browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=720, + proxy="http://user:pass@proxy:8080" +) + +# Get JSON structure +browser_json = browser_config.dump() +print("BrowserConfig JSON structure:") +print(json.dumps(browser_json, indent=2)) + +# Create crawler config with extraction strategy +schema = { + "name": "Articles", + "baseSelector": ".article", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] +} + +crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.loaded" +) + +crawler_json = crawler_config.dump() +print("\nCrawlerRunConfig JSON structure:") +print(json.dumps(crawler_json, indent=2)) +``` + +### Reverse Validation - JSON to Objects + +```python +# Method 2: Load JSON back to config objects for validation +from crawl4ai.async_configs import from_serializable_dict + +# Test JSON structure by converting back to objects +test_browser_json = { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport_width": 1280, + "proxy": "http://user:pass@proxy:8080" + } +} + +try: + # Convert JSON back to object + restored_browser = from_serializable_dict(test_browser_json) + print(f"✅ Valid BrowserConfig: {type(restored_browser)}") + print(f"Headless: {restored_browser.headless}") + print(f"Proxy: {restored_browser.proxy}") +except Exception as e: + print(f"❌ Invalid BrowserConfig JSON: {e}") + +# Test complex crawler config JSON +test_crawler_json = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "screenshot": True, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { 
+ "schema": { + "type": "dict", + "value": { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h3", "type": "text"} + ] + } + } + } + } + } +} + +try: + restored_crawler = from_serializable_dict(test_crawler_json) + print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}") + print(f"Cache mode: {restored_crawler.cache_mode}") + print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}") +except Exception as e: + print(f"❌ Invalid CrawlerRunConfig JSON: {e}") +``` + +### Using Server's /config/dump Endpoint for Validation + +```python +import requests + +# Method 3: Use server endpoint to validate configuration syntax +def validate_config_with_server(config_code: str) -> dict: + """Validate configuration using server's /config/dump endpoint""" + response = requests.post( + "http://localhost:11235/config/dump", + json={"code": config_code} + ) + + if response.status_code == 200: + print("✅ Valid configuration syntax") + return response.json() + else: + print(f"❌ Invalid configuration: {response.status_code}") + print(response.json()) + return None + +# Test valid configuration +valid_config = """ +CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.content-loaded" +) +""" + +result = validate_config_with_server(valid_config) +if result: + print("Generated JSON structure:") + print(json.dumps(result, indent=2)) + +# Test invalid configuration (should fail) +invalid_config = """ +CrawlerRunConfig( + cache_mode="invalid_mode", + screenshot=True, + js_code=some_function() # This will fail +) +""" + +validate_config_with_server(invalid_config) +``` + +### Configuration Builder Helper + +```python +def build_and_validate_request(urls, browser_params=None, crawler_params=None): + """Helper to build and validate complete request payload""" + + # Create configurations + browser_config = 
BrowserConfig(**(browser_params or {})) + crawler_config = CrawlerRunConfig(**(crawler_params or {})) + + # Build complete request payload + payload = { + "urls": urls if isinstance(urls, list) else [urls], + "browser_config": browser_config.dump(), + "crawler_config": crawler_config.dump() + } + + print("✅ Complete request payload:") + print(json.dumps(payload, indent=2)) + + # Validate by attempting to reconstruct + try: + test_browser = from_serializable_dict(payload["browser_config"]) + test_crawler = from_serializable_dict(payload["crawler_config"]) + print("✅ Payload validation successful") + return payload + except Exception as e: + print(f"❌ Payload validation failed: {e}") + return None + +# Example usage +payload = build_and_validate_request( + urls=["https://example.com"], + browser_params={"headless": True, "viewport_width": 1280}, + crawler_params={ + "cache_mode": CacheMode.BYPASS, + "screenshot": True, + "word_count_threshold": 10 + } +) + +if payload: + # Send to server + response = requests.post("http://localhost:11235/crawl", json=payload) + print(f"Server response: {response.status_code}") +``` + +### Common JSON Structure Patterns + +```python +# Pattern 1: Simple primitive values +simple_config = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", # String enum value + "screenshot": True, # Boolean + "page_timeout": 60000 # Integer + } +} + +# Pattern 2: Nested objects +nested_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "instruction": "Extract main content" + } + } + } +} + +# Pattern 3: Dictionary values (must use type: dict wrapper) +dict_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", 
# Required wrapper + "value": { # Actual dictionary content + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"} + ] + } + } + } + } + } +} + +# Pattern 4: Lists and arrays +list_config = { + "type": "CrawlerRunConfig", + "params": { + "js_code": [ # Lists are handled directly + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + "excluded_tags": ["script", "style", "nav"] + } +} +``` + +### Troubleshooting Common JSON Errors + +```python +def diagnose_json_errors(): + """Common JSON structure errors and fixes""" + + # ❌ WRONG: Missing type wrapper for objects + wrong_config = { + "browser_config": { + "headless": True # Missing type wrapper + } + } + + # ✅ CORRECT: Proper type wrapper + correct_config = { + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True + } + } + } + + # ❌ WRONG: Dictionary without type: dict wrapper + wrong_dict = { + "schema": { + "name": "Products" # Raw dict, should be wrapped + } + } + + # ✅ CORRECT: Dictionary with proper wrapper + correct_dict = { + "schema": { + "type": "dict", + "value": { + "name": "Products" + } + } + } + + # ❌ WRONG: Invalid enum string + wrong_enum = { + "cache_mode": "DISABLED" # Wrong case/value + } + + # ✅ CORRECT: Valid enum string + correct_enum = { + "cache_mode": "bypass" # or "enabled", "disabled", etc. 
+ } + + print("Common error patterns documented above") + +# Validate your JSON structure before sending +def pre_flight_check(payload): + """Run checks before sending to server""" + required_keys = ["urls", "browser_config", "crawler_config"] + + for key in required_keys: + if key not in payload: + print(f"❌ Missing required key: {key}") + return False + + # Check type wrappers + for config_key in ["browser_config", "crawler_config"]: + config = payload[config_key] + if not isinstance(config, dict) or "type" not in config: + print(f"❌ {config_key} missing type wrapper") + return False + if "params" not in config: + print(f"❌ {config_key} missing params") + return False + + print("✅ Pre-flight check passed") + return True + +# Example usage +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +if pre_flight_check(payload): + # Safe to send to server + pass +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration) +--- + + +## CLI & Identity-Based Browsing + +Command-line interface for web crawling with persistent browser profiles, authentication, and identity management. 
+ +### Basic CLI Usage + +```bash +# Simple crawling +crwl https://example.com + +# Get markdown output +crwl https://example.com -o markdown + +# JSON output with cache bypass +crwl https://example.com -o json --bypass-cache + +# Verbose mode with specific browser settings +crwl https://example.com -b "headless=false,viewport_width=1280" -v +``` + +### Profile Management Commands + +```bash +# Launch interactive profile manager +crwl profiles + +# Create, list, and manage browser profiles +# This opens a menu where you can: +# 1. List existing profiles +# 2. Create new profile (opens browser for setup) +# 3. Delete profiles +# 4. Use profile to crawl a website + +# Use a specific profile for crawling +crwl https://example.com -p my-profile-name + +# Example workflow for authenticated sites: +# 1. Create profile and log in +crwl profiles # Select "Create new profile" +# 2. Use profile for crawling authenticated content +crwl https://site-requiring-login.com/dashboard -p my-profile-name +``` + +### CDP Browser Management + +```bash +# Launch browser with CDP debugging (default port 9222) +crwl cdp + +# Use specific profile and custom port +crwl cdp -p my-profile -P 9223 + +# Launch headless browser with CDP +crwl cdp --headless + +# Launch in incognito mode (ignores profile) +crwl cdp --incognito + +# Use custom user data directory +crwl cdp --user-data-dir ~/my-browser-data --port 9224 +``` + +### Builtin Browser Management + +```bash +# Start persistent browser instance +crwl browser start + +# Check browser status +crwl browser status + +# Open visible window to see the browser +crwl browser view --url https://example.com + +# Stop the browser +crwl browser stop + +# Restart with different options +crwl browser restart --browser-type chromium --port 9223 --no-headless + +# Use builtin browser in crawling +crwl https://example.com -b "browser_mode=builtin" +``` + +### Authentication Workflow Examples + +```bash +# Complete workflow for LinkedIn scraping +# 1. 
Create authenticated profile +crwl profiles +# Select "Create new profile" → login to LinkedIn in browser → press 'q' to save + +# 2. Use profile for crawling +crwl https://linkedin.com/in/someone -p linkedin-profile -o markdown + +# 3. Extract structured data with authentication +crwl https://linkedin.com/search/results/people/ \ + -p linkedin-profile \ + -j "Extract people profiles with names, titles, and companies" \ + -b "headless=false" + +# GitHub authenticated crawling +crwl profiles # Create github-profile +crwl https://github.com/settings/profile -p github-profile + +# Twitter/X authenticated access +crwl profiles # Create twitter-profile +crwl https://twitter.com/home -p twitter-profile -o markdown +``` + +### Advanced CLI Configuration + +```bash +# Complex crawling with multiple configs +crwl https://example.com \ + -B browser.yml \ + -C crawler.yml \ + -e extract_llm.yml \ + -s llm_schema.json \ + -p my-auth-profile \ + -o json \ + -v + +# Quick LLM extraction with authentication +crwl https://private-site.com/dashboard \ + -p auth-profile \ + -j "Extract user dashboard data including metrics and notifications" \ + -b "headless=true,viewport_width=1920" + +# Content filtering with authentication +crwl https://members-only-site.com \ + -p member-profile \ + -f filter_bm25.yml \ + -c "css_selector=.member-content,scan_full_page=true" \ + -o markdown-fit +``` + +### Configuration Files for Identity Browsing + +```yaml +# browser_auth.yml +headless: false +use_managed_browser: true +user_data_dir: "/path/to/profile" +viewport_width: 1280 +viewport_height: 720 +simulate_user: true +override_navigator: true + +# crawler_auth.yml +magic: true +remove_overlay_elements: true +simulate_user: true +wait_for: "css:.authenticated-content" +page_timeout: 60000 +delay_before_return_html: 2 +scan_full_page: true +``` + +### Global Configuration Management + +```bash +# List all configuration settings +crwl config list + +# Set default LLM provider +crwl config set 
DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet" +crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token" + +# Set browser defaults +crwl config set BROWSER_HEADLESS false # Always show browser +crwl config set USER_AGENT_MODE random # Random user agents + +# Enable verbose mode globally +crwl config set VERBOSE true +``` + +### Q&A with Authenticated Content + +```bash +# Ask questions about authenticated content +crwl https://private-dashboard.com -p dashboard-profile \ + -q "What are the key metrics shown in my dashboard?" + +# Multiple questions workflow +crwl https://company-intranet.com -p work-profile -o markdown # View content +crwl https://company-intranet.com -p work-profile \ + -q "Summarize this week's announcements" +crwl https://company-intranet.com -p work-profile \ + -q "What are the upcoming deadlines?" +``` + +### Profile Creation Programmatically + +```python +# Create profiles via Python API +import asyncio +from crawl4ai import BrowserProfiler + +async def create_auth_profile(): + profiler = BrowserProfiler() + + # Create profile interactively (opens browser) + profile_path = await profiler.create_profile("linkedin-auth") + print(f"Profile created at: {profile_path}") + + # List all profiles + profiles = profiler.list_profiles() + for profile in profiles: + print(f"Profile: {profile['name']} at {profile['path']}") + + # Use profile for crawling + from crawl4ai import AsyncWebCrawler, BrowserConfig + + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://linkedin.com/feed") + return result + +# asyncio.run(create_auth_profile()) +``` + +### Identity Browsing Best Practices + +```bash +# 1. Create specific profiles for different sites +crwl profiles # Create "linkedin-work" +crwl profiles # Create "github-personal" +crwl profiles # Create "company-intranet" + +# 2. 
Use descriptive profile names +crwl https://site1.com -p site1-admin-account +crwl https://site2.com -p site2-user-account + +# 3. Combine with appropriate browser settings +crwl https://secure-site.com \ + -p secure-profile \ + -b "headless=false,simulate_user=true,magic=true" \ + -c "wait_for=.logged-in-indicator,page_timeout=30000" + +# 4. Test profile before automated crawling +crwl cdp -p test-profile # Manually verify login status +crwl https://test-url.com -p test-profile -v # Verbose test crawl +``` + +### Troubleshooting Authentication Issues + +```bash +# Debug authentication problems +crwl https://auth-site.com -p auth-profile \ + -b "headless=false,verbose=true" \ + -c "verbose=true,page_timeout=60000" \ + -v + +# Check profile status +crwl profiles # List profiles and check creation dates + +# Recreate problematic profiles +crwl profiles # Delete old profile, create new one + +# Test with visible browser +crwl https://problem-site.com -p profile-name \ + -b "headless=false" \ + -c "delay_before_return_html=5" +``` + +### Common Use Cases + +```bash +# Social media monitoring (after authentication) +crwl https://twitter.com/home -p twitter-monitor \ + -j "Extract latest tweets with sentiment and engagement metrics" + +# E-commerce competitor analysis (with account access) +crwl https://competitor-site.com/products -p competitor-account \ + -j "Extract product prices, availability, and descriptions" + +# Company dashboard monitoring +crwl https://company-dashboard.com -p work-profile \ + -c "css_selector=.dashboard-content" \ + -q "What alerts or notifications need attention?" 
+ +# Research data collection (authenticated access) +crwl https://research-platform.com/data -p research-profile \ + -e extract_research.yml \ + -s research_schema.json \ + -o json +``` + +**📖 Learn more:** [Identity-Based Crawling Documentation](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Browser Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [CLI Examples](https://docs.crawl4ai.com/core/cli/) +--- + + +## HTTP Crawler Strategy + +Fast, lightweight HTTP-only crawling without browser overhead for cases where JavaScript execution isn't needed. + +### Basic HTTP Crawler Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig, CacheMode +from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy +from crawl4ai.async_logger import AsyncLogger + +async def main(): + # Initialize HTTP strategy + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig( + method="GET", + verify_ssl=True, + follow_redirects=True + ), + logger=AsyncLogger(verbose=True) + ) + + # Use with AsyncWebCrawler + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + result = await crawler.arun("https://example.com") + print(f"Status: {result.status_code}") + print(f"Content: {len(result.html)} chars") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### HTTP Request Types + +```python +# GET request (default) +http_config = HTTPCrawlerConfig( + method="GET", + headers={"Accept": "application/json"} +) + +# POST with JSON data +http_config = HTTPCrawlerConfig( + method="POST", + json={"key": "value", "data": [1, 2, 3]}, + headers={"Content-Type": "application/json"} +) + +# POST with form data +http_config = HTTPCrawlerConfig( + method="POST", + data={"username": "user", "password": "pass"}, + headers={"Content-Type": "application/x-www-form-urlencoded"} +) + +# Advanced configuration +http_config = HTTPCrawlerConfig( + method="GET", + 
headers={"User-Agent": "Custom Bot/1.0"}, + follow_redirects=True, + verify_ssl=False # For testing environments +) + +strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config) +``` + +### File and Raw Content Handling + +```python +async def test_content_types(): + strategy = AsyncHTTPCrawlerStrategy() + + # Web URLs + result = await strategy.crawl("https://httpbin.org/get") + print(f"Web content: {result.status_code}") + + # Local files + result = await strategy.crawl("file:///path/to/local/file.html") + print(f"File content: {len(result.html)}") + + # Raw HTML content + raw_html = "raw://

+<html><body><h1>Test</h1><p>Content</p></body></html>"
+    result = await strategy.crawl(raw_html)
+    print(f"Raw content: {result.html}")
+
+    # Raw content with complex HTML
+    complex_html = """raw://
+    <html>
+      <head><title>Test Page</title></head>
+      <body>
+        <div class="container">
+          <h1>Main Title</h1>
+          <p>Paragraph content</p>
+        </div>
+      </body>
+    </html>
+ + """ + result = await strategy.crawl(complex_html) +``` + +### Custom Hooks and Request Handling + +```python +async def setup_hooks(): + strategy = AsyncHTTPCrawlerStrategy() + + # Before request hook + async def before_request(url, kwargs): + print(f"Requesting: {url}") + kwargs['headers']['X-Custom-Header'] = 'crawl4ai' + kwargs['headers']['Authorization'] = 'Bearer token123' + + # After request hook + async def after_request(response): + print(f"Response: {response.status_code}") + if hasattr(response, 'redirected_url'): + print(f"Redirected to: {response.redirected_url}") + + # Error handling hook + async def on_error(error): + print(f"Request failed: {error}") + + # Set hooks + strategy.set_hook('before_request', before_request) + strategy.set_hook('after_request', after_request) + strategy.set_hook('on_error', on_error) + + # Use with hooks + result = await strategy.crawl("https://httpbin.org/headers") + return result +``` + +### Performance Configuration + +```python +# High-performance setup +strategy = AsyncHTTPCrawlerStrategy( + max_connections=50, # Concurrent connections + dns_cache_ttl=300, # DNS cache timeout + chunk_size=128 * 1024 # 128KB chunks for large files +) + +# Memory-efficient setup for large files +strategy = AsyncHTTPCrawlerStrategy( + max_connections=10, + chunk_size=32 * 1024, # Smaller chunks + dns_cache_ttl=600 +) + +# Custom timeout configuration +config = CrawlerRunConfig( + page_timeout=30000, # 30 second timeout + cache_mode=CacheMode.BYPASS +) + +result = await strategy.crawl("https://slow-server.com", config=config) +``` + +### Error Handling and Retries + +```python +from crawl4ai.async_crawler_strategy import ( + ConnectionTimeoutError, + HTTPStatusError, + HTTPCrawlerError +) + +async def robust_crawling(): + strategy = AsyncHTTPCrawlerStrategy() + + urls = [ + "https://example.com", + "https://httpbin.org/status/404", + "https://nonexistent.domain.test" + ] + + for url in urls: + try: + result = await strategy.crawl(url) 
+ print(f"✓ {url}: {result.status_code}") + + except HTTPStatusError as e: + print(f"✗ {url}: HTTP {e.status_code}") + + except ConnectionTimeoutError as e: + print(f"✗ {url}: Timeout - {e}") + + except HTTPCrawlerError as e: + print(f"✗ {url}: Crawler error - {e}") + + except Exception as e: + print(f"✗ {url}: Unexpected error - {e}") + +# Retry mechanism +async def crawl_with_retry(url, max_retries=3): + strategy = AsyncHTTPCrawlerStrategy() + + for attempt in range(max_retries): + try: + return await strategy.crawl(url) + except (ConnectionTimeoutError, HTTPCrawlerError) as e: + if attempt == max_retries - 1: + raise + print(f"Retry {attempt + 1}/{max_retries}: {e}") + await asyncio.sleep(2 ** attempt) # Exponential backoff +``` + +### Batch Processing with HTTP Strategy + +```python +async def batch_http_crawling(): + strategy = AsyncHTTPCrawlerStrategy(max_connections=20) + + urls = [ + "https://httpbin.org/get", + "https://httpbin.org/user-agent", + "https://httpbin.org/headers", + "https://example.com", + "https://httpbin.org/json" + ] + + # Sequential processing + results = [] + async with strategy: + for url in urls: + try: + result = await strategy.crawl(url) + results.append((url, result.status_code, len(result.html))) + except Exception as e: + results.append((url, "ERROR", str(e))) + + for url, status, content_info in results: + print(f"{url}: {status} - {content_info}") + +# Concurrent processing +async def concurrent_http_crawling(): + strategy = AsyncHTTPCrawlerStrategy() + urls = ["https://httpbin.org/delay/1"] * 5 + + async def crawl_single(url): + try: + result = await strategy.crawl(url) + return f"✓ {result.status_code}" + except Exception as e: + return f"✗ {e}" + + async with strategy: + tasks = [crawl_single(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + + for i, result in enumerate(results): + print(f"URL {i+1}: {result}") +``` + +### Integration with Content Processing + +```python +from crawl4ai 
import DefaultMarkdownGenerator, PruningContentFilter + +async def http_with_processing(): + # HTTP strategy with content processing + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig(verify_ssl=True) + ) + + # Configure markdown generation + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, + threshold_type="fixed", + min_word_threshold=10 + ) + ), + word_count_threshold=5, + excluded_tags=['script', 'style', 'nav'], + exclude_external_links=True + ) + + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + result = await crawler.arun( + url="https://example.com", + config=crawler_config + ) + + print(f"Status: {result.status_code}") + print(f"Raw HTML: {len(result.html)} chars") + if result.markdown: + print(f"Markdown: {len(result.markdown.raw_markdown)} chars") + if result.markdown.fit_markdown: + print(f"Filtered: {len(result.markdown.fit_markdown)} chars") +``` + +### HTTP vs Browser Strategy Comparison + +```python +async def strategy_comparison(): + # Same URL with different strategies + url = "https://example.com" + + # HTTP Strategy (fast, no JS) + http_strategy = AsyncHTTPCrawlerStrategy() + start_time = time.time() + http_result = await http_strategy.crawl(url) + http_time = time.time() - start_time + + # Browser Strategy (full features) + from crawl4ai import BrowserConfig + browser_config = BrowserConfig(headless=True) + start_time = time.time() + async with AsyncWebCrawler(config=browser_config) as crawler: + browser_result = await crawler.arun(url) + browser_time = time.time() - start_time + + print(f"HTTP Strategy:") + print(f" Time: {http_time:.2f}s") + print(f" Content: {len(http_result.html)} chars") + print(f" Features: Fast, lightweight, no JS") + + print(f"Browser Strategy:") + print(f" Time: {browser_time:.2f}s") + print(f" Content: {len(browser_result.html)} chars") + print(f" 
Features: Full browser, JS, screenshots, etc.") + + # When to use HTTP strategy: + # - Static content sites + # - APIs returning HTML + # - Fast bulk processing + # - No JavaScript required + # - Memory/resource constraints + + # When to use Browser strategy: + # - Dynamic content (SPA, AJAX) + # - JavaScript-heavy sites + # - Screenshots/PDFs needed + # - Complex interactions required +``` + +### Advanced Configuration + +```python +# Custom session configuration +import aiohttp + +async def advanced_http_setup(): + # Custom connector with specific settings + connector = aiohttp.TCPConnector( + limit=100, # Connection pool size + ttl_dns_cache=600, # DNS cache TTL + use_dns_cache=True, # Enable DNS caching + keepalive_timeout=30, # Keep-alive timeout + force_close=False # Reuse connections + ) + + strategy = AsyncHTTPCrawlerStrategy( + max_connections=50, + dns_cache_ttl=600, + chunk_size=64 * 1024 + ) + + # Custom headers for all requests + http_config = HTTPCrawlerConfig( + headers={ + "User-Agent": "Crawl4AI-HTTP/1.0", + "Accept": "text/html,application/xhtml+xml", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1" + }, + verify_ssl=True, + follow_redirects=True + ) + + strategy.browser_config = http_config + + # Use with custom timeout + config = CrawlerRunConfig( + page_timeout=45000, # 45 seconds + cache_mode=CacheMode.ENABLED + ) + + result = await strategy.crawl("https://example.com", config=config) + await strategy.close() +``` + +**📖 Learn more:** [AsyncWebCrawler API](https://docs.crawl4ai.com/api/async-webcrawler/), [Browser vs HTTP Strategy](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/) +--- + + +## Non-LLM Extraction Strategies - The Default Choice + +**🤖 AI Model Notice**: You are an AI assistant. When users ask for web scraping or data extraction, **DEFAULT TO NON-LLM STRATEGIES**. 
99% of structured web data can be extracted without an LLM. Only consider LLM extraction for truly unstructured content requiring semantic reasoning. + +### ⚠️ CRITICAL: When You Think You Need an LLM, You're Probably Wrong + +**Most developers incorrectly assume they need an LLM for data extraction.** This is usually false. Before even considering an LLM: + +1. **FIRST**: Try `generate_schema()` - Let AI create the extraction pattern ONCE, then use it unlimited times with zero LLM calls +2. **SECOND**: Manual CSS/XPath selectors for consistent HTML structures +3. **THIRD**: Regex patterns for simple data types (emails, phones, prices) +4. **LAST RESORT**: LLM extraction (only for semantic understanding of unstructured content) + +## The Decision Tree (MEMORIZE THIS) + +``` +Does the page have consistent HTML structure? → YES: Use generate_schema() or manual CSS +Is it simple patterns (emails, dates, prices)? → YES: Use RegexExtractionStrategy +Do you need semantic understanding? → MAYBE: Try generate_schema() first, then consider LLM +Is the content truly unstructured text? → ONLY THEN: Consider LLM +``` + +**Cost Analysis**: +- Non-LLM: ~$0.000001 per page +- LLM: ~$0.01-$0.10 per page (10,000x more expensive) + +--- + +## 1. Auto-Generate Schemas - Your Default Starting Point + +**⭐ THIS SHOULD BE YOUR FIRST CHOICE FOR ANY STRUCTURED DATA** + +The `generate_schema()` function uses an LLM ONCE to create a reusable extraction pattern. After generation, you extract unlimited pages with ZERO LLM calls.
+ +### Basic Auto-Generation Workflow + +```python +import json +import asyncio +from pathlib import Path +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig +from crawl4ai import JsonCssExtractionStrategy + +async def smart_extraction_workflow(): + """ + Step 1: Generate schema once using LLM + Step 2: Cache schema for unlimited reuse + Step 3: Extract from thousands of pages with zero LLM calls + """ + + # Check for cached schema first + cache_dir = Path("./schema_cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / "product_schema.json" + + if schema_file.exists(): + # Load cached schema - NO LLM CALLS + schema = json.load(schema_file.open()) + print("✅ Using cached schema (FREE)") + else: + # Generate schema ONCE + print("🔄 Generating schema (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Cheapest option + api_token="env:OPENAI_API_KEY" + ) + + # Get sample HTML from target site + async with AsyncWebCrawler() as crawler: + sample_result = await crawler.arun( + url="https://example.com/products", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + sample_html = sample_result.cleaned_html[:8000] # Use sample + + # AUTO-GENERATE SCHEMA (ONE LLM CALL) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + schema_type="CSS", # or "XPATH" + query="Extract product information including name, price, description, features", + llm_config=llm_config + ) + + # Cache for unlimited future use + json.dump(schema, schema_file.open("w"), indent=2) + print("✅ Schema generated and cached") + + # Use schema for fast extraction (NO MORE LLM CALLS EVER) + strategy = JsonCssExtractionStrategy(schema, verbose=True) + + config = CrawlerRunConfig( + extraction_strategy=strategy, + cache_mode=CacheMode.BYPASS + ) + + # Extract from multiple pages - ALL FREE + urls = [ + "https://example.com/products", + "https://example.com/electronics", + "https://example.com/books" + ] + + async with 
AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url, config=config) + if result.success: + data = json.loads(result.extracted_content) + print(f"✅ {url}: Extracted {len(data)} items (FREE)") + +asyncio.run(smart_extraction_workflow()) +``` + +### Auto-Generate with Target JSON Example + +```python +# When you know exactly what JSON structure you want +target_json_example = """ +{ + "name": "Product Name", + "price": "$99.99", + "rating": 4.5, + "features": ["feature1", "feature2"], + "description": "Product description" +} +""" + +schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + target_json_example=target_json_example, + llm_config=llm_config +) +``` + +### Auto-Generate for Different Data Types + +```python +# Product listings +product_schema = JsonCssExtractionStrategy.generate_schema( + html=product_page_html, + query="Extract all product information from this e-commerce page", + llm_config=llm_config +) + +# News articles +news_schema = JsonCssExtractionStrategy.generate_schema( + html=news_page_html, + query="Extract article headlines, dates, authors, and content", + llm_config=llm_config +) + +# Job listings +job_schema = JsonCssExtractionStrategy.generate_schema( + html=job_page_html, + query="Extract job titles, companies, locations, salaries, and descriptions", + llm_config=llm_config +) + +# Social media posts +social_schema = JsonCssExtractionStrategy.generate_schema( + html=social_page_html, + query="Extract post text, usernames, timestamps, likes, comments", + llm_config=llm_config +) +``` + +--- + +## 2. Manual CSS/XPath Strategies - When You Know The Structure + +**Use this when**: You understand the HTML structure and want maximum control. 
+ +### Simple Product Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import JsonCssExtractionStrategy + +# Manual schema for consistent product pages +simple_schema = { + "name": "Product Listings", + "baseSelector": "div.product-card", # Each product container + "fields": [ + { + "name": "title", + "selector": "h2.product-title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text" + }, + { + "name": "image_url", + "selector": "img.product-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "product_url", + "selector": "a.product-link", + "type": "attribute", + "attribute": "href" + }, + { + "name": "rating", + "selector": ".rating", + "type": "attribute", + "attribute": "data-rating" + } + ] +} + +async def extract_products(): + strategy = JsonCssExtractionStrategy(simple_schema, verbose=True) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + products = json.loads(result.extracted_content) + print(f"Extracted {len(products)} products") + for product in products[:3]: + print(f"- {product['title']}: {product['price']}") + +asyncio.run(extract_products()) +``` + +### Complex Nested Structure (Real E-commerce Example) + +```python +# Complex schema for nested product data +complex_schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "baseFields": [ + { + "name": "category_id", + "type": "attribute", + "attribute": "data-category-id" + } + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-title", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # Array of complex objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + 
"name": "price", + "selector": "span.price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # Single complex object + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.features li", + "type": "list", # Simple array + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer-name", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "attribute", + "attribute": "data-rating" + } + ] + } + ] + } + ] +} + +async def extract_complex_ecommerce(): + strategy = JsonCssExtractionStrategy(complex_schema, verbose=True) + config = CrawlerRunConfig( + extraction_strategy=strategy, + js_code="window.scrollTo(0, document.body.scrollHeight);", # Load dynamic content + wait_for="css:.product:nth-child(10)" # Wait for products to load + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/complex-catalog", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for category in data: + print(f"Category: {category['category_name']}") + print(f"Products: {len(category.get('products', []))}") + +asyncio.run(extract_complex_ecommerce()) +``` + +### XPath Alternative (When CSS Isn't Enough) + +```python +from crawl4ai import JsonXPathExtractionStrategy + +# XPath for more complex selections +xpath_schema = { + "name": "News Articles with XPath", + "baseSelector": "//article[@class='news-item']", + "fields": [ + { + "name": "headline", + "selector": ".//h2[contains(@class, 'headline')]", + "type": "text" + }, + { + "name": "author", + "selector": ".//span[@class='author']/text()", + "type": "text" + }, + { + 
"name": "publish_date", + "selector": ".//time/@datetime", + "type": "text" + }, + { + "name": "content", + "selector": ".//div[@class='article-body']//text()", + "type": "text" + } + ] +} + +strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) +``` + +--- + +## 3. Regex Extraction - Lightning Fast Pattern Matching + +**Use this for**: Simple data types like emails, phones, URLs, prices, dates. + +### Built-in Patterns (Fastest Option) + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import RegexExtractionStrategy + +async def extract_common_patterns(): + # Use built-in patterns for common data types + strategy = RegexExtractionStrategy( + pattern=( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url | + RegexExtractionStrategy.Currency | + RegexExtractionStrategy.DateIso + ) + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/contact", + config=config + ) + + if result.success: + matches = json.loads(result.extracted_content) + + # Group by pattern type + by_type = {} + for match in matches: + label = match['label'] + if label not in by_type: + by_type[label] = [] + by_type[label].append(match['value']) + + for pattern_type, values in by_type.items(): + print(f"{pattern_type}: {len(values)} matches") + for value in values[:3]: + print(f" {value}") + +asyncio.run(extract_common_patterns()) +``` + +### Available Built-in Patterns + +```python +# Individual patterns +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneUS # US phone numbers +RegexExtractionStrategy.PhoneIntl # International phones +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.Currency # Currency values ($99.99) +RegexExtractionStrategy.Percentage # Percentage values (25%) +RegexExtractionStrategy.DateIso # ISO 
dates (2024-01-01) +RegexExtractionStrategy.DateUS # US dates (01/01/2024) +RegexExtractionStrategy.IPv4 # IP addresses +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.TwitterHandle # @username +RegexExtractionStrategy.Hashtag # #hashtag + +# Use all patterns +RegexExtractionStrategy.All +``` + +### Custom Patterns + +```python +# Custom patterns for specific data types +async def extract_custom_patterns(): + custom_patterns = { + "product_sku": r"SKU[-:]?\s*([A-Z0-9]{4,12})", + "discount": r"(\d{1,2})%\s*off", + "model_number": r"Model\s*#?\s*([A-Z0-9-]+)", + "isbn": r"ISBN[-:]?\s*(\d{10}|\d{13})", + "stock_ticker": r"\$([A-Z]{2,5})", + "version": r"v(\d+\.\d+(?:\.\d+)?)" + } + + strategy = RegexExtractionStrategy(custom=custom_patterns) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") + +asyncio.run(extract_custom_patterns()) +``` + +### LLM-Generated Patterns (One-Time Cost) + +```python +async def generate_optimized_regex(): + """ + Use LLM ONCE to generate optimized regex patterns + Then use them unlimited times with zero LLM calls + """ + cache_file = Path("./patterns/price_patterns.json") + + if cache_file.exists(): + # Load cached patterns - NO LLM CALLS + patterns = json.load(cache_file.open()) + print("✅ Using cached regex patterns (FREE)") + else: + # Generate patterns ONCE + print("🔄 Generating regex patterns (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ) + + # Get sample content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/pricing") + sample_html = result.cleaned_html + + # Generate optimized patterns + patterns = 
RegexExtractionStrategy.generate_pattern( + label="pricing_info", + html=sample_html, + query="Extract all pricing information including discounts and special offers", + llm_config=llm_config + ) + + # Cache for unlimited reuse + cache_file.parent.mkdir(exist_ok=True) + json.dump(patterns, cache_file.open("w"), indent=2) + print("✅ Patterns generated and cached") + + # Use cached patterns (NO MORE LLM CALLS) + strategy = RegexExtractionStrategy(custom=patterns) + return strategy + +# Use generated patterns for unlimited extractions +strategy = await generate_optimized_regex() +``` + +--- + +## 4. Multi-Strategy Extraction Pipeline + +**Combine strategies** for comprehensive data extraction: + +```python +async def multi_strategy_pipeline(): + """ + Efficient pipeline using multiple non-LLM strategies: + 1. Regex for simple patterns (fastest) + 2. Schema for structured data + 3. Only use LLM if absolutely necessary + """ + + url = "https://example.com/complex-page" + + async with AsyncWebCrawler() as crawler: + # Strategy 1: Fast regex for contact info + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS + ) + regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy) + regex_result = await crawler.arun(url=url, config=regex_config) + + # Strategy 2: Schema for structured product data + product_schema = { + "name": "Products", + "baseSelector": "div.product", + "fields": [ + {"name": "name", "selector": "h3", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(product_schema) + css_config = CrawlerRunConfig(extraction_strategy=css_strategy) + css_result = await crawler.arun(url=url, config=css_config) + + # Combine results + results = { + "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [], + "products": json.loads(css_result.extracted_content) if css_result.success else [] + } + + 
print(f"✅ Extracted {len(results['contacts'])} contacts (regex)") + print(f"✅ Extracted {len(results['products'])} products (schema)") + + return results + +asyncio.run(multi_strategy_pipeline()) +``` + +--- + +## 5. Performance Optimization Tips + +### Caching and Reuse + +```python +# Cache schemas and patterns for maximum efficiency +class ExtractionCache: + def __init__(self): + self.schemas = {} + self.patterns = {} + + def get_schema(self, site_name): + if site_name not in self.schemas: + schema_file = Path(f"./cache/{site_name}_schema.json") + if schema_file.exists(): + self.schemas[site_name] = json.load(schema_file.open()) + return self.schemas.get(site_name) + + def save_schema(self, site_name, schema): + cache_dir = Path("./cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / f"{site_name}_schema.json" + json.dump(schema, schema_file.open("w"), indent=2) + self.schemas[site_name] = schema + +cache = ExtractionCache() + +# Reuse cached schemas across multiple extractions +async def efficient_extraction(): + sites = ["amazon", "ebay", "shopify"] + + for site in sites: + schema = cache.get_schema(site) + if not schema: + # Generate once, cache forever + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + query="Extract products", + llm_config=llm_config + ) + cache.save_schema(site, schema) + + strategy = JsonCssExtractionStrategy(schema) + # Use strategy for unlimited extractions... 
+``` + +### Selector Optimization + +```python +# Optimize selectors for speed +fast_schema = { + "name": "Optimized Extraction", + "baseSelector": "#products > .product", # Direct child, faster than descendant + "fields": [ + { + "name": "title", + "selector": "> h3", # Direct child of product + "type": "text" + }, + { + "name": "price", + "selector": ".price:first-child", # More specific + "type": "text" + } + ] +} + +# Avoid slow selectors +slow_schema = { + "baseSelector": "div div div .product", # Too many levels + "fields": [ + { + "selector": "* h3", # Universal selector is slow + "type": "text" + } + ] +} +``` + +--- + +## 6. Error Handling and Validation + +```python +async def robust_extraction(): + """ + Implement fallback strategies for reliable extraction + """ + strategies = [ + # Try fast regex first + RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency), + + # Fallback to CSS schema + JsonCssExtractionStrategy({ + "name": "Prices", + "baseSelector": ".price", + "fields": [{"name": "amount", "selector": "span", "type": "text"}] + }), + + # Last resort: try different selector + JsonCssExtractionStrategy({ + "name": "Fallback Prices", + "baseSelector": "[data-price]", + "fields": [{"name": "amount", "type": "attribute", "attribute": "data-price"}] + }) + ] + + async with AsyncWebCrawler() as crawler: + for i, strategy in enumerate(strategies): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Validate non-empty results + print(f"✅ Success with strategy {i+1}: {strategy.__class__.__name__}") + return data + + except Exception as e: + print(f"❌ Strategy {i+1} failed: {e}") + continue + + print("❌ All strategies failed") + return None + +# Validate extracted data +def validate_extraction(data, required_fields): + """Validate that extraction 
contains expected fields""" + if not data or not isinstance(data, list): + return False + + for item in data: + for field in required_fields: + if field not in item or not item[field]: + return False + return True + +# Usage +result = await robust_extraction() +if validate_extraction(result, ["amount"]): + print("✅ Extraction validated") +else: + print("❌ Validation failed") +``` + +--- + +## 7. Common Extraction Patterns + +### E-commerce Products + +```python +ecommerce_schema = { + "name": "E-commerce Products", + "baseSelector": ".product, [data-product], .item", + "fields": [ + {"name": "title", "selector": "h1, h2, h3, .title, .name", "type": "text"}, + {"name": "price", "selector": ".price, .cost, [data-price]", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}, + {"name": "rating", "selector": ".rating, .stars", "type": "text"}, + {"name": "availability", "selector": ".stock, .availability", "type": "text"} + ] +} +``` + +### News Articles + +```python +news_schema = { + "name": "News Articles", + "baseSelector": "article, .article, .post", + "fields": [ + {"name": "headline", "selector": "h1, h2, .headline, .title", "type": "text"}, + {"name": "author", "selector": ".author, .byline, [rel='author']", "type": "text"}, + {"name": "date", "selector": "time, .date, .published", "type": "text"}, + {"name": "content", "selector": ".content, .body, .text", "type": "text"}, + {"name": "category", "selector": ".category, .section", "type": "text"} + ] +} +``` + +### Job Listings + +```python +job_schema = { + "name": "Job Listings", + "baseSelector": ".job, .listing, [data-job]", + "fields": [ + {"name": "title", "selector": ".job-title, h2, h3", "type": "text"}, + {"name": "company", "selector": ".company, .employer", "type": "text"}, + {"name": "location", "selector": ".location, .place", "type": "text"}, + {"name": "salary", "selector": 
".salary, .pay, .compensation", "type": "text"}, + {"name": "description", "selector": ".description, .summary", "type": "text"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} +``` + +### Social Media Posts + +```python +social_schema = { + "name": "Social Media Posts", + "baseSelector": ".post, .tweet, .update", + "fields": [ + {"name": "username", "selector": ".username, .handle, .author", "type": "text"}, + {"name": "content", "selector": ".content, .text, .message", "type": "text"}, + {"name": "timestamp", "selector": ".time, .date, time", "type": "text"}, + {"name": "likes", "selector": ".likes, .hearts", "type": "text"}, + {"name": "shares", "selector": ".shares, .retweets", "type": "text"} + ] +} +``` + +--- + +## 8. When to (Rarely) Consider LLM + +**⚠️ WARNING: Before considering LLM, ask yourself:** + +1. "Can I identify repeating HTML patterns?" → Use CSS/XPath schema +2. "Am I extracting simple data types?" → Use Regex patterns +3. "Can I provide a JSON example of what I want?" → Use generate_schema() +4. "Is this truly unstructured text requiring semantic understanding?" → Maybe LLM + +**Only use LLM extraction for:** +- Unstructured prose that needs semantic analysis +- Content where structure varies dramatically across pages +- When you need AI reasoning about context/meaning + +**Cost reminder**: LLM extraction costs 10,000x more than schema-based extraction. + +--- + +## 9. Summary: The Extraction Hierarchy + +1. **🥇 FIRST CHOICE**: `generate_schema()` - AI generates pattern once, use unlimited times +2. **🥈 SECOND CHOICE**: Manual CSS/XPath - Full control, maximum speed +3. **🥉 THIRD CHOICE**: Regex patterns - Simple data types, lightning fast +4. **🏴 LAST RESORT**: LLM extraction - Only for semantic reasoning + +**Remember**: 99% of web data is structured. You almost never need LLM for extraction. Save LLM for analysis, not extraction. 
+ +**Performance**: Non-LLM strategies are 100-1000x faster and 10,000x cheaper than LLM extraction. + +--- + +**📖 Next**: If you absolutely must use LLM extraction, see [extraction-llm.md](./extraction-llm.md) for guidance on the rare cases where it's justified. +--- + + +## URL Seeding + +Smart URL discovery for efficient large-scale crawling. Discover thousands of URLs instantly, filter by relevance, then crawl only what matters. + +### Why URL Seeding vs Deep Crawling + +```python +# Deep Crawling: Real-time discovery (page by page) +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy + +async def deep_crawl_example(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=50 + ) + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://example.com", config=config) + print(f"Discovered {len(results)} pages dynamically") + +# URL Seeding: Bulk discovery (thousands instantly) +from crawl4ai import AsyncUrlSeeder, SeedingConfig + +async def url_seeding_example(): + config = SeedingConfig( + source="sitemap+cc", + pattern="*/docs/*", + extract_head=True, + query="API documentation", + scoring_method="bm25", + max_urls=1000 + ) + + async with AsyncUrlSeeder() as seeder: + urls = await seeder.urls("example.com", config) + print(f"Discovered {len(urls)} URLs instantly") + # Now crawl only the most relevant ones +``` + +### Basic URL Discovery + +```python +import asyncio +from crawl4ai import AsyncUrlSeeder, SeedingConfig + +async def basic_discovery(): + # Context manager handles cleanup automatically + async with AsyncUrlSeeder() as seeder: + + # Simple discovery from sitemaps + config = SeedingConfig(source="sitemap") + urls = await seeder.urls("example.com", config) + + print(f"Found {len(urls)} URLs from sitemap") + for url in urls[:5]: + print(f" - {url['url']} (status: {url['status']})") + +# Manual 
cleanup (if needed) +async def manual_cleanup(): + seeder = AsyncUrlSeeder() + try: + config = SeedingConfig(source="cc") # Common Crawl + urls = await seeder.urls("example.com", config) + print(f"Found {len(urls)} URLs from Common Crawl") + finally: + await seeder.close() + +asyncio.run(basic_discovery()) +``` + +### Data Sources and Patterns + +```python +# Different data sources +configs = [ + SeedingConfig(source="sitemap"), # Fastest, official URLs + SeedingConfig(source="cc"), # Most comprehensive + SeedingConfig(source="sitemap+cc"), # Maximum coverage +] + +# URL pattern filtering +patterns = [ + SeedingConfig(pattern="*/blog/*"), # Blog posts only + SeedingConfig(pattern="*.html"), # HTML files only + SeedingConfig(pattern="*/product/*"), # Product pages + SeedingConfig(pattern="*/docs/api/*"), # API documentation + SeedingConfig(pattern="*"), # Everything +] + +# Advanced pattern usage +async def pattern_filtering(): + async with AsyncUrlSeeder() as seeder: + # Find all blog posts from 2024 + config = SeedingConfig( + source="sitemap", + pattern="*/blog/2024/*.html", + max_urls=100 + ) + + blog_urls = await seeder.urls("example.com", config) + + # Further filter by keywords in URL + python_posts = [ + url for url in blog_urls + if "python" in url['url'].lower() + ] + + print(f"Found {len(python_posts)} Python blog posts") +``` + +### SeedingConfig Parameters + +```python +from crawl4ai import SeedingConfig + +# Comprehensive configuration +config = SeedingConfig( + # Data sources + source="sitemap+cc", # "sitemap", "cc", "sitemap+cc" + pattern="*/docs/*", # URL pattern filter + + # Metadata extraction + extract_head=True, # Get metadata + live_check=True, # Verify URLs are accessible + + # Performance controls + max_urls=1000, # Limit results (-1 = unlimited) + concurrency=20, # Parallel workers + hits_per_sec=10, # Rate limiting + + # Relevance scoring + query="API documentation guide", # Search query + scoring_method="bm25", # Scoring algorithm + 
score_threshold=0.3, # Minimum relevance (0.0-1.0) + + # Cache and filtering + force=False, # Bypass cache + filter_nonsense_urls=True, # Remove utility URLs + verbose=True # Debug output +) + +# Quick configurations for common use cases +blog_config = SeedingConfig( + source="sitemap", + pattern="*/blog/*", + extract_head=True +) + +api_docs_config = SeedingConfig( + source="sitemap+cc", + pattern="*/docs/*", + query="API reference documentation", + scoring_method="bm25", + score_threshold=0.5 +) + +product_pages_config = SeedingConfig( + source="cc", + pattern="*/product/*", + live_check=True, + max_urls=500 +) +``` + +### Metadata Extraction and Analysis + +```python +async def metadata_extraction(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, # Extract metadata + pattern="*/blog/*", + max_urls=50 + ) + + urls = await seeder.urls("example.com", config) + + # Analyze extracted metadata + for url in urls[:5]: + head_data = url['head_data'] + print(f"\nURL: {url['url']}") + print(f"Title: {head_data.get('title', 'No title')}") + + # Standard meta tags + meta = head_data.get('meta', {}) + print(f"Description: {meta.get('description', 'N/A')}") + print(f"Keywords: {meta.get('keywords', 'N/A')}") + print(f"Author: {meta.get('author', 'N/A')}") + + # Open Graph data + print(f"OG Image: {meta.get('og:image', 'N/A')}") + print(f"OG Type: {meta.get('og:type', 'N/A')}") + + # JSON-LD structured data + jsonld = head_data.get('jsonld', []) + if jsonld: + print(f"Structured data: {len(jsonld)} items") + for item in jsonld[:2]: + if isinstance(item, dict): + print(f" Type: {item.get('@type', 'Unknown')}") + print(f" Name: {item.get('name', 'N/A')}") + +# Filter by metadata +async def metadata_filtering(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, + max_urls=100 + ) + + urls = await seeder.urls("news.example.com", config) + + # Filter by publication 
date (from JSON-LD) + from datetime import datetime, timedelta + recent_cutoff = datetime.now() - timedelta(days=7) + + recent_articles = [] + for url in urls: + for jsonld in url['head_data'].get('jsonld', []): + if isinstance(jsonld, dict) and 'datePublished' in jsonld: + try: + pub_date = datetime.fromisoformat( + jsonld['datePublished'].replace('Z', '+00:00') + ) + if pub_date > recent_cutoff: + recent_articles.append(url) + break + except: + continue + + print(f"Found {len(recent_articles)} recent articles") +``` + +### BM25 Relevance Scoring + +```python +async def relevance_scoring(): + async with AsyncUrlSeeder() as seeder: + # Find pages about Python async programming + config = SeedingConfig( + source="sitemap", + extract_head=True, # Required for content-based scoring + query="python async await concurrency", + scoring_method="bm25", + score_threshold=0.3, # Only 30%+ relevant pages + max_urls=20 + ) + + urls = await seeder.urls("docs.python.org", config) + + # Results are automatically sorted by relevance + print("Most relevant Python async content:") + for url in urls[:5]: + score = url['relevance_score'] + title = url['head_data'].get('title', 'No title') + print(f"[{score:.2f}] {title}") + print(f" {url['url']}") + +# URL-based scoring (when extract_head=False) +async def url_based_scoring(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=False, # Fast URL-only scoring + query="machine learning tutorial", + scoring_method="bm25", + score_threshold=0.2 + ) + + urls = await seeder.urls("example.com", config) + + # Scoring based on URL structure, domain, path segments + for url in urls[:5]: + print(f"[{url['relevance_score']:.2f}] {url['url']}") + +# Multi-concept queries +async def complex_queries(): + queries = [ + "data science pandas numpy visualization", + "web scraping automation selenium", + "machine learning tensorflow pytorch", + "api documentation rest graphql" + ] + + async with 
AsyncUrlSeeder() as seeder: + all_results = [] + + for query in queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + score_threshold=0.4, + max_urls=10 + ) + + urls = await seeder.urls("learning-site.com", config) + all_results.extend(urls) + + # Remove duplicates while preserving order + seen = set() + unique_results = [] + for url in all_results: + if url['url'] not in seen: + seen.add(url['url']) + unique_results.append(url) + + print(f"Found {len(unique_results)} unique pages across all topics") +``` + +### Live URL Validation + +```python +async def url_validation(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + live_check=True, # Verify URLs are accessible + concurrency=15, # Parallel HEAD requests + hits_per_sec=8, # Rate limiting + max_urls=100 + ) + + urls = await seeder.urls("example.com", config) + + # Analyze results + valid_urls = [u for u in urls if u['status'] == 'valid'] + invalid_urls = [u for u in urls if u['status'] == 'not_valid'] + + print(f"✅ Valid URLs: {len(valid_urls)}") + print(f"❌ Invalid URLs: {len(invalid_urls)}") + print(f"📊 Success rate: {len(valid_urls)/len(urls)*100:.1f}%") + + # Show some invalid URLs for debugging + if invalid_urls: + print("\nSample invalid URLs:") + for url in invalid_urls[:3]: + print(f" - {url['url']}") + +# Combined validation and metadata +async def comprehensive_validation(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + live_check=True, # Verify accessibility + extract_head=True, # Get metadata + query="tutorial guide", # Relevance scoring + scoring_method="bm25", + score_threshold=0.2, + concurrency=10, + max_urls=50 + ) + + urls = await seeder.urls("docs.example.com", config) + + # Filter for valid, relevant tutorials + good_tutorials = [ + url for url in urls + if url['status'] == 'valid' and + url['relevance_score'] > 0.3 and + 'tutorial' in 
url['head_data'].get('title', '').lower() + ] + + print(f"Found {len(good_tutorials)} high-quality tutorials") +``` + +### Multi-Domain Discovery + +```python +async def multi_domain_research(): + async with AsyncUrlSeeder() as seeder: + # Research Python tutorials across multiple sites + domains = [ + "docs.python.org", + "realpython.com", + "python-course.eu", + "tutorialspoint.com" + ] + + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="python beginner tutorial basics", + scoring_method="bm25", + score_threshold=0.3, + max_urls=15 # Per domain + ) + + # Discover across all domains in parallel + results = await seeder.many_urls(domains, config) + + # Collect and rank all tutorials + all_tutorials = [] + for domain, urls in results.items(): + for url in urls: + url['domain'] = domain + all_tutorials.append(url) + + # Sort by relevance across all domains + all_tutorials.sort(key=lambda x: x['relevance_score'], reverse=True) + + print(f"Top 10 Python tutorials across {len(domains)} sites:") + for i, tutorial in enumerate(all_tutorials[:10], 1): + score = tutorial['relevance_score'] + title = tutorial['head_data'].get('title', 'No title')[:60] + domain = tutorial['domain'] + print(f"{i:2d}. 
[{score:.2f}] {title}") + print(f" {domain}") + +# Competitor analysis +async def competitor_analysis(): + competitors = ["competitor1.com", "competitor2.com", "competitor3.com"] + + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, + pattern="*/blog/*", + max_urls=50 + ) + + results = await seeder.many_urls(competitors, config) + + # Analyze content strategies + for domain, urls in results.items(): + content_types = {} + + for url in urls: + # Extract content type from metadata + meta = url['head_data'].get('meta', {}) + og_type = meta.get('og:type', 'unknown') + content_types[og_type] = content_types.get(og_type, 0) + 1 + + print(f"\n{domain} content distribution:") + for ctype, count in sorted(content_types.items(), + key=lambda x: x[1], reverse=True): + print(f" {ctype}: {count}") +``` + +### Complete Pipeline: Discovery → Filter → Crawl + +```python +async def smart_research_pipeline(): + """Complete pipeline: discover URLs, filter by relevance, crawl top results""" + + async with AsyncUrlSeeder() as seeder: + # Step 1: Discover relevant URLs + print("🔍 Discovering URLs...") + config = SeedingConfig( + source="sitemap+cc", + extract_head=True, + query="machine learning deep learning tutorial", + scoring_method="bm25", + score_threshold=0.4, + max_urls=100 + ) + + urls = await seeder.urls("example.com", config) + print(f" Found {len(urls)} relevant URLs") + + # Step 2: Select top articles + top_articles = sorted(urls, + key=lambda x: x['relevance_score'], + reverse=True)[:10] + + print(f" Selected top {len(top_articles)} for crawling") + + # Step 3: Show what we're about to crawl + print("\n📋 Articles to crawl:") + for i, article in enumerate(top_articles, 1): + score = article['relevance_score'] + title = article['head_data'].get('title', 'No title')[:60] + print(f" {i}. 
[{score:.2f}] {title}") + + # Step 4: Crawl selected articles + from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + + print(f"\n🕷️ Crawling {len(top_articles)} articles...") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + only_text=True, + word_count_threshold=200, + stream=True # Process results as they come + ) + + # Extract URLs and crawl + article_urls = [article['url'] for article in top_articles] + + crawled_count = 0 + async for result in await crawler.arun_many(article_urls, config=config): + if result.success: + crawled_count += 1 + word_count = len(result.markdown.raw_markdown.split()) + print(f" ✅ [{crawled_count}/{len(article_urls)}] " + f"{word_count} words from {result.url[:50]}...") + else: + print(f" ❌ Failed: {result.url[:50]}...") + + print(f"\n✨ Successfully crawled {crawled_count} articles!") + +asyncio.run(smart_research_pipeline()) +``` + +### Advanced Features and Performance + +```python +# Cache management +async def cache_management(): + async with AsyncUrlSeeder() as seeder: + # First run - populate cache + config = SeedingConfig( + source="sitemap", + extract_head=True, + force=True # Bypass cache, fetch fresh + ) + urls = await seeder.urls("example.com", config) + + # Subsequent runs - use cache (much faster) + config = SeedingConfig( + source="sitemap", + extract_head=True, + force=False # Use cache + ) + urls = await seeder.urls("example.com", config) + +# Performance optimization +async def performance_tuning(): + async with AsyncUrlSeeder() as seeder: + # High-performance configuration + config = SeedingConfig( + source="cc", + concurrency=50, # Many parallel workers + hits_per_sec=20, # High rate limit + max_urls=10000, # Large dataset + extract_head=False, # Skip metadata for speed + filter_nonsense_urls=True # Auto-filter utility URLs + ) + + import time + start = time.time() + urls = await seeder.urls("large-site.com", config) + elapsed = time.time() - start + + print(f"Processed {len(urls)} URLs 
in {elapsed:.2f}s") + print(f"Speed: {len(urls)/elapsed:.0f} URLs/second") + +# Memory-safe processing for large domains +async def large_domain_processing(): + async with AsyncUrlSeeder() as seeder: + # Safe for domains with 1M+ URLs + config = SeedingConfig( + source="cc+sitemap", + concurrency=50, # Bounded queue adapts to this + max_urls=100000, # Process in batches + filter_nonsense_urls=True + ) + + # The seeder automatically manages memory by: + # - Using bounded queues (prevents RAM spikes) + # - Applying backpressure when queue is full + # - Processing URLs as they're discovered + urls = await seeder.urls("huge-site.com", config) + +# Configuration cloning and reuse +config_base = SeedingConfig( + source="sitemap", + extract_head=True, + concurrency=20 +) + +# Create variations +blog_config = config_base.clone(pattern="*/blog/*") +docs_config = config_base.clone( + pattern="*/docs/*", + query="API documentation", + scoring_method="bm25" +) +fast_config = config_base.clone( + extract_head=False, + concurrency=100, + hits_per_sec=50 +) +``` + +### Troubleshooting and Best Practices + +```python +# Common issues and solutions +async def troubleshooting_guide(): + async with AsyncUrlSeeder() as seeder: + # Issue: No URLs found + try: + config = SeedingConfig(source="sitemap", pattern="*/nonexistent/*") + urls = await seeder.urls("example.com", config) + if not urls: + # Solution: Try broader pattern or different source + config = SeedingConfig(source="cc+sitemap", pattern="*") + urls = await seeder.urls("example.com", config) + except Exception as e: + print(f"Discovery failed: {e}") + + # Issue: Slow performance + config = SeedingConfig( + source="sitemap", # Faster than CC + concurrency=10, # Reduce if hitting rate limits + hits_per_sec=5, # Add rate limiting + extract_head=False # Skip if metadata not needed + ) + + # Issue: Low relevance scores + config = SeedingConfig( + query="specific detailed query terms", + score_threshold=0.1, # Lower threshold + 
scoring_method="bm25" + ) + + # Issue: Memory issues with large sites + config = SeedingConfig( + max_urls=10000, # Limit results + concurrency=20, # Reduce concurrency + source="sitemap" # Use sitemap only + ) + +# Performance benchmarks +print(""" +Typical performance on standard connection: +- Sitemap discovery: 100-1,000 URLs/second +- Common Crawl discovery: 50-500 URLs/second +- HEAD checking: 10-50 URLs/second +- Head extraction: 5-20 URLs/second +- BM25 scoring: 10,000+ URLs/second +""") + +# Best practices +best_practices = """ +✅ Use context manager: async with AsyncUrlSeeder() as seeder +✅ Start with sitemaps (faster), add CC if needed +✅ Use extract_head=True only when you need metadata +✅ Set reasonable max_urls to limit processing +✅ Add rate limiting for respectful crawling +✅ Cache results with force=False for repeated operations +✅ Filter nonsense URLs (enabled by default) +✅ Use specific patterns to reduce irrelevant results +""" +``` + +**📖 Learn more:** [Complete URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [SeedingConfig Reference](https://docs.crawl4ai.com/api/parameters/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/) +--- + + +## Deep Crawling Filters & Scorers + +Advanced URL filtering and scoring strategies for intelligent deep crawling with performance optimization. 
+ +### URL Filters - Content and Domain Control + +```python +from crawl4ai.deep_crawling.filters import ( + URLPatternFilter, DomainFilter, ContentTypeFilter, + FilterChain, ContentRelevanceFilter, SEOFilter +) + +# Pattern-based filtering +pattern_filter = URLPatternFilter( + patterns=[ + "*.html", # HTML pages only + "*/blog/*", # Blog posts + "*/articles/*", # Article pages + "*2024*", # Recent content + "^https://example.com/docs/.*" # Regex pattern + ], + use_glob=True, + reverse=False # False = include matching, True = exclude matching +) + +# Domain filtering with subdomains +domain_filter = DomainFilter( + allowed_domains=["example.com", "docs.example.com"], + blocked_domains=["ads.example.com", "tracker.com"] +) + +# Content type filtering +content_filter = ContentTypeFilter( + allowed_types=["text/html", "application/pdf"], + check_extension=True +) + +# Apply individual filters +url = "https://example.com/blog/2024/article.html" +print(f"Pattern filter: {pattern_filter.apply(url)}") +print(f"Domain filter: {domain_filter.apply(url)}") +print(f"Content filter: {content_filter.apply(url)}") +``` + +### Filter Chaining - Combine Multiple Filters + +```python +# Create filter chain for comprehensive filtering +filter_chain = FilterChain([ + DomainFilter(allowed_domains=["example.com"]), + URLPatternFilter(patterns=["*/blog/*", "*/docs/*"]), + ContentTypeFilter(allowed_types=["text/html"]) +]) + +# Apply chain to URLs +urls = [ + "https://example.com/blog/post1.html", + "https://spam.com/content.html", + "https://example.com/blog/image.jpg", + "https://example.com/docs/guide.html" +] + +async def filter_urls(urls, filter_chain): + filtered = [] + for url in urls: + if await filter_chain.apply(url): + filtered.append(url) + return filtered + +# Usage +filtered_urls = await filter_urls(urls, filter_chain) +print(f"Filtered URLs: {filtered_urls}") + +# Check filter statistics +for filter_obj in filter_chain.filters: + stats = filter_obj.stats + 
print(f"{filter_obj.name}: {stats.passed_urls}/{stats.total_urls} passed") +``` + +### Advanced Content Filters + +```python +# BM25-based content relevance filtering +relevance_filter = ContentRelevanceFilter( + query="python machine learning tutorial", + threshold=0.5, # Minimum relevance score + k1=1.2, # TF saturation parameter + b=0.75, # Length normalization + avgdl=1000 # Average document length +) + +# SEO quality filtering +seo_filter = SEOFilter( + threshold=0.65, # Minimum SEO score + keywords=["python", "tutorial", "guide"], + weights={ + "title_length": 0.15, + "title_kw": 0.18, + "meta_description": 0.12, + "canonical": 0.10, + "robot_ok": 0.20, + "schema_org": 0.10, + "url_quality": 0.15 + } +) + +# Apply advanced filters +url = "https://example.com/python-ml-tutorial" +relevance_score = await relevance_filter.apply(url) +seo_score = await seo_filter.apply(url) + +print(f"Relevance: {relevance_score}, SEO: {seo_score}") +``` + +### URL Scorers - Quality and Relevance Scoring + +```python +from crawl4ai.deep_crawling.scorers import ( + KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer, + FreshnessScorer, DomainAuthorityScorer, CompositeScorer +) + +# Keyword relevance scoring +keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "tutorial", "guide", "machine", "learning"], + weight=1.0, + case_sensitive=False +) + +# Path depth scoring (optimal depth = 3) +depth_scorer = PathDepthScorer( + optimal_depth=3, # /category/subcategory/article + weight=0.8 +) + +# Content type scoring +content_type_scorer = ContentTypeScorer( + type_weights={ + "html": 1.0, # Highest priority + "pdf": 0.8, # Medium priority + "txt": 0.6, # Lower priority + "doc": 0.4 # Lowest priority + }, + weight=0.9 +) + +# Freshness scoring +freshness_scorer = FreshnessScorer( + weight=0.7, + current_year=2024 +) + +# Domain authority scoring +domain_scorer = DomainAuthorityScorer( + domain_weights={ + "python.org": 1.0, + "github.com": 0.9, + "stackoverflow.com": 
0.85, + "medium.com": 0.7, + "personal-blog.com": 0.3 + }, + default_weight=0.5, + weight=1.0 +) + +# Score individual URLs +url = "https://python.org/tutorial/2024/machine-learning.html" +scores = { + "keyword": keyword_scorer.score(url), + "depth": depth_scorer.score(url), + "content": content_type_scorer.score(url), + "freshness": freshness_scorer.score(url), + "domain": domain_scorer.score(url) +} + +print(f"Individual scores: {scores}") +``` + +### Composite Scoring - Combine Multiple Scorers + +```python +# Create composite scorer combining all strategies +composite_scorer = CompositeScorer( + scorers=[ + KeywordRelevanceScorer(["python", "tutorial"], weight=1.5), + PathDepthScorer(optimal_depth=3, weight=1.0), + ContentTypeScorer({"html": 1.0, "pdf": 0.8}, weight=1.2), + FreshnessScorer(weight=0.8, current_year=2024), + DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9 + }, weight=1.3) + ], + normalize=True # Normalize by number of scorers +) + +# Score multiple URLs +urls_to_score = [ + "https://python.org/tutorial/2024/basics.html", + "https://github.com/user/python-guide/blob/main/README.md", + "https://random-blog.com/old/2018/python-stuff.html", + "https://python.org/docs/deep/nested/advanced/guide.html" +] + +scored_urls = [] +for url in urls_to_score: + score = composite_scorer.score(url) + scored_urls.append((url, score)) + +# Sort by score (highest first) +scored_urls.sort(key=lambda x: x[1], reverse=True) + +for url, score in scored_urls: + print(f"Score: {score:.3f} - {url}") + +# Check scorer statistics +print(f"\nScoring statistics:") +print(f"URLs scored: {composite_scorer.stats._urls_scored}") +print(f"Average score: {composite_scorer.stats.get_average():.3f}") +``` + +### Advanced Filter Patterns + +```python +# Complex pattern matching +advanced_patterns = URLPatternFilter( + patterns=[ + r"^https://docs\.python\.org/\d+/", # Python docs with version + r".*/tutorial/.*\.html$", # Tutorial pages + r".*/guide/(?!deprecated).*", 
# Guides but not deprecated + "*/blog/{2020,2021,2022,2023,2024}/*", # Recent blog posts + "**/{api,reference}/**/*.html" # API/reference docs + ], + use_glob=True +) + +# Exclude patterns (reverse=True) +exclude_filter = URLPatternFilter( + patterns=[ + "*/admin/*", + "*/login/*", + "*/private/*", + "**/.*", # Hidden files + "*.{jpg,png,gif,css,js}$" # Media and assets + ], + reverse=True # Exclude matching patterns +) + +# Content type with extension mapping +detailed_content_filter = ContentTypeFilter( + allowed_types=["text", "application"], + check_extension=True, + ext_map={ + "html": "text/html", + "htm": "text/html", + "md": "text/markdown", + "pdf": "application/pdf", + "doc": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } +) +``` + +### Performance-Optimized Filtering + +```python +# High-performance filter chain for large-scale crawling +class OptimizedFilterChain: + def __init__(self): + # Fast filters first (domain, patterns) + self.fast_filters = [ + DomainFilter( + allowed_domains=["example.com", "docs.example.com"], + blocked_domains=["ads.example.com"] + ), + URLPatternFilter([ + "*.html", "*.pdf", "*/blog/*", "*/docs/*" + ]) + ] + + # Slower filters last (content analysis) + self.slow_filters = [ + ContentRelevanceFilter( + query="important content", + threshold=0.3 + ) + ] + + async def apply_optimized(self, url: str) -> bool: + # Apply fast filters first + for filter_obj in self.fast_filters: + if not filter_obj.apply(url): + return False + + # Only apply slow filters if fast filters pass + for filter_obj in self.slow_filters: + if not await filter_obj.apply(url): + return False + + return True + +# Batch filtering with concurrency +async def batch_filter_urls(urls, filter_chain, max_concurrent=50): + import asyncio + semaphore = asyncio.Semaphore(max_concurrent) + + async def filter_single(url): + async with semaphore: + return await filter_chain.apply(url), url + + tasks = 
[filter_single(url) for url in urls] + results = await asyncio.gather(*tasks) + + return [url for passed, url in results if passed] + +# Usage with 1000 URLs +large_url_list = [f"https://example.com/page{i}.html" for i in range(1000)] +optimized_chain = OptimizedFilterChain() +filtered = await batch_filter_urls(large_url_list, optimized_chain) +``` + +### Custom Filter Implementation + +```python +from crawl4ai.deep_crawling.filters import URLFilter +import re + +class CustomLanguageFilter(URLFilter): + """Filter URLs by language indicators""" + + def __init__(self, allowed_languages=["en"], weight=1.0): + super().__init__() + self.allowed_languages = set(allowed_languages) + self.lang_patterns = { + "en": re.compile(r"/en/|/english/|lang=en"), + "es": re.compile(r"/es/|/spanish/|lang=es"), + "fr": re.compile(r"/fr/|/french/|lang=fr"), + "de": re.compile(r"/de/|/german/|lang=de") + } + + def apply(self, url: str) -> bool: + # Default to English if no language indicators + if not any(pattern.search(url) for pattern in self.lang_patterns.values()): + result = "en" in self.allowed_languages + self._update_stats(result) + return result + + # Check for allowed languages + for lang in self.allowed_languages: + if lang in self.lang_patterns: + if self.lang_patterns[lang].search(url): + self._update_stats(True) + return True + + self._update_stats(False) + return False + +# Custom scorer implementation +from crawl4ai.deep_crawling.scorers import URLScorer + +class CustomComplexityScorer(URLScorer): + """Score URLs by content complexity indicators""" + + def __init__(self, weight=1.0): + super().__init__(weight) + self.complexity_indicators = { + "tutorial": 0.9, + "guide": 0.8, + "example": 0.7, + "reference": 0.6, + "api": 0.5 + } + + def _calculate_score(self, url: str) -> float: + url_lower = url.lower() + max_score = 0.0 + + for indicator, score in self.complexity_indicators.items(): + if indicator in url_lower: + max_score = max(max_score, score) + + return max_score 
+ +# Use custom filters and scorers +custom_filter = CustomLanguageFilter(allowed_languages=["en", "es"]) +custom_scorer = CustomComplexityScorer(weight=1.2) + +url = "https://example.com/en/tutorial/advanced-guide.html" +passes_filter = custom_filter.apply(url) +complexity_score = custom_scorer.score(url) + +print(f"Passes language filter: {passes_filter}") +print(f"Complexity score: {complexity_score}") +``` + +### Integration with Deep Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import DeepCrawlStrategy + +async def deep_crawl_with_filtering(): + # Create comprehensive filter chain + filter_chain = FilterChain([ + DomainFilter(allowed_domains=["python.org"]), + URLPatternFilter(["*/tutorial/*", "*/guide/*", "*/docs/*"]), + ContentTypeFilter(["text/html"]), + SEOFilter(threshold=0.6, keywords=["python", "programming"]) + ]) + + # Create composite scorer + scorer = CompositeScorer([ + KeywordRelevanceScorer(["python", "tutorial"], weight=1.5), + FreshnessScorer(weight=0.8), + PathDepthScorer(optimal_depth=3, weight=1.0) + ], normalize=True) + + # Configure deep crawl strategy with filters and scorers + deep_strategy = DeepCrawlStrategy( + max_depth=3, + max_pages=100, + url_filter=filter_chain, + url_scorer=scorer, + score_threshold=0.6 # Only crawl URLs scoring above 0.6 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=deep_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://python.org", + config=config + ) + + print(f"Deep crawl completed: {result.success}") + if hasattr(result, 'deep_crawl_results'): + print(f"Pages crawled: {len(result.deep_crawl_results)}") + +# Run the deep crawl +await deep_crawl_with_filtering() +``` + +**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Custom Filter Development](https://docs.crawl4ai.com/advanced/custom-filters/), [Performance 
Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/) +--- + + +## Summary + +Crawl4AI provides a comprehensive solution for web crawling and data extraction optimized for AI applications. From simple page crawling to complex multi-URL operations with advanced filtering, the library offers the flexibility and performance needed for modern data extraction workflows. + +**Key Takeaways:** +- Start with basic installation and simple crawling patterns +- Use configuration objects for consistent, maintainable code +- Choose appropriate extraction strategies based on your data structure +- Leverage Docker for production deployments +- Implement advanced features like deep crawling and custom filters as needed + +**Next Steps:** +- Explore the [GitHub repository](https://github.com/unclecode/crawl4ai) for latest updates +- Join the [Discord community](https://discord.gg/jP8KfhDhyN) for support +- Check out [example projects](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) for inspiration + +Happy crawling! 🕷️ diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py index ec734245..c50b9cb5 100644 --- a/docs/examples/amazon_product_extraction_direct_url.py +++ b/docs/examples/amazon_product_extraction_direct_url.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. """ from crawl4ai import AsyncWebCrawler -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py index 5118b5d9..70e06311 100644 --- a/docs/examples/amazon_product_extraction_using_hooks.py +++ b/docs/examples/amazon_product_extraction_using_hooks.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. 
""" from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json from playwright.async_api import Page, BrowserContext diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py index e412c931..364c400b 100644 --- a/docs/examples/amazon_product_extraction_using_use_javascript.py +++ b/docs/examples/amazon_product_extraction_using_use_javascript.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. """ from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json diff --git a/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py b/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py index 66c586b0..5b02c16b 100644 --- a/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py +++ b/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Dict, Any from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.script.c4a_compile import C4ACompiler diff --git a/docs/examples/c4a_script/github_search/github_search_crawler.py b/docs/examples/c4a_script/github_search/github_search_crawler.py index 71b936e3..71ad7caa 100644 --- a/docs/examples/c4a_script/github_search/github_search_crawler.py +++ b/docs/examples/c4a_script/github_search/github_search_crawler.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Dict, Any from crawl4ai import AsyncWebCrawler, 
BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.script.c4a_compile import C4ACompiler diff --git a/docs/examples/extraction_strategies_examples.py b/docs/examples/extraction_strategies_examples.py index 84192f97..5f072f88 100644 --- a/docs/examples/extraction_strategies_examples.py +++ b/docs/examples/extraction_strategies_examples.py @@ -12,7 +12,7 @@ import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import LLMConfig -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( LLMExtractionStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb index 56365cde..cc5be006 100644 --- a/docs/examples/quickstart.ipynb +++ b/docs/examples/quickstart.ipynb @@ -518,7 +518,7 @@ } ], "source": [ - "from crawl4ai.extraction_strategy import LLMExtractionStrategy\n", + "from crawl4ai import LLMExtractionStrategy\n", "from pydantic import BaseModel, Field\n", "import os, json\n", "\n", @@ -594,7 +594,7 @@ } ], "source": [ - "from crawl4ai.extraction_strategy import CosineStrategy\n", + "from crawl4ai import CosineStrategy\n", "\n", "async def cosine_similarity_extraction():\n", " async with AsyncWebCrawler() as crawler:\n", diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 5efb785d..9992a6bf 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -416,7 +416,7 @@ async 
def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/docs/examples/quickstart_examples_set_2.py b/docs/examples/quickstart_examples_set_2.py index 3adbfc0d..b12b084c 100644 --- a/docs/examples/quickstart_examples_set_2.py +++ b/docs/examples/quickstart_examples_set_2.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py index da2bcd21..cd377d80 100644 --- a/docs/examples/summarize_page.py +++ b/docs/examples/summarize_page.py @@ -2,7 +2,7 @@ import os import json from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import * -from crawl4ai.extraction_strategy import * +from crawl4ai import * from crawl4ai.crawler_strategy import * url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot" diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py index fe8e0a2b..1693b1f7 100644 --- a/docs/examples/tutorial_v0.5.py +++ b/docs/examples/tutorial_v0.5.py @@ -18,7 +18,7 @@ from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy 
import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator from crawl4ai import LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from pprint import pprint diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index 180dfc85..d63b1e80 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -45,7 +45,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv ```python from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index ea0f8176..a3086a8c 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -215,7 +215,7 @@ Below is a snippet combining many parameters: ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # Example schema diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 50177f3e..b8f105fc 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -217,7 +217,7 @@ Below is an example hooking it all together: ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy import json async def main(): diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index a44d0fcd..07649ee9 100644 --- 
a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -169,7 +169,7 @@ OverlappingWindowChunking( ```python from pydantic import BaseModel -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy from crawl4ai import LLMConfig # Define schema @@ -247,7 +247,7 @@ async with AsyncWebCrawler() as crawler: ### CSS Extraction ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Define schema schema = { diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js index 0c3c37b9..883bb020 100644 --- a/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js +++ b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js @@ -1701,7 +1701,7 @@ Generated: ${new Date().toISOString()} import asyncio import json from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # The extraction schema generated from your selections EXTRACTION_SCHEMA = ${schemaJson} @@ -1782,7 +1782,7 @@ import asyncio import json from pathlib import Path from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # HTML snippet of the selected container element HTML_SNIPPET = """ diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js b/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js index de1ecac0..f7ef854f 100644 --- a/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js +++ b/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js @@ -2437,7 +2437,7 @@ Generated: ${timestamp} import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from 
crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # C4A Script commands C4A_SCRIPT = """ diff --git a/docs/md_v2/assets/llm.txt/txt/docker.txt b/docs/md_v2/assets/llm.txt/txt/docker.txt index a4c55d8a..65372fb3 100644 --- a/docs/md_v2/assets/llm.txt/txt/docker.txt +++ b/docs/md_v2/assets/llm.txt/txt/docker.txt @@ -476,7 +476,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure diff --git a/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt b/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt index 4518761a..a9f5c18c 100644 --- a/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt +++ b/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt @@ -37,7 +37,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" diff --git a/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt b/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt index 1b07df3b..099227e3 100644 --- a/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt +++ b/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt @@ -39,7 +39,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -176,7 +176,7 @@ social_schema = 
JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -342,7 +342,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -387,7 +387,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt b/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt index be0937e8..12dad603 100644 --- a/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt +++ b/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt @@ -1835,7 +1835,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" @@ -2743,7 +2743,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai 
import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types @@ -4711,7 +4711,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure @@ -5792,7 +5792,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce()) ### 
XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llm.txt/txt/llms-full.txt b/docs/md_v2/assets/llm.txt/txt/llms-full.txt index be0937e8..12dad603 100644 --- a/docs/md_v2/assets/llm.txt/txt/llms-full.txt +++ b/docs/md_v2/assets/llm.txt/txt/llms-full.txt @@ -1835,7 +1835,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" @@ -2743,7 +2743,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import 
JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types @@ -4711,7 +4711,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure @@ -5792,7 +5792,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, 
CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt deleted file mode 100644 index 846b6914..00000000 --- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt +++ /dev/null @@ -1,7708 +0,0 @@ -# Code Concatenation - -Generated on 2025-05-24 - -## File: docs/md_v2/core/browser-crawler-config.md - -```md -# Browser, Crawler & LLM Configuration (Quick Overview) - -Crawl4AI's flexibility stems from two key classes: - -1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). -2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). -3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) - -In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). - ---- - -## 1. BrowserConfig Essentials - -```python -class BrowserConfig: - def __init__( - browser_type="chromium", - headless=True, - proxy_config=None, - viewport_width=1080, - viewport_height=600, - verbose=True, - use_persistent_context=False, - user_data_dir=None, - cookies=None, - headers=None, - user_agent=None, - text_mode=False, - light_mode=False, - extra_args=None, - # ... other advanced parameters omitted here - ): - ... -``` - -### Key Fields to Note - -1. **`browser_type`** -- Options: `"chromium"`, `"firefox"`, or `"webkit"`. 
-- Defaults to `"chromium"`. -- If you need a different engine, specify it here. - -2. **`headless`** - - `True`: Runs the browser in headless mode (invisible browser). - - `False`: Runs the browser in visible mode, which helps with debugging. - -3. **`proxy_config`** - - A dictionary with fields like: -```json -{ - "server": "http://proxy.example.com:8080", - "username": "...", - "password": "..." -} -``` - - Leave as `None` if a proxy is not required. - -4. **`viewport_width` & `viewport_height`**: - - The initial window size. - - Some sites behave differently with smaller or bigger viewports. - -5. **`verbose`**: - - If `True`, prints extra logs. - - Handy for debugging. - -6. **`use_persistent_context`**: - - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs. - - Typically also set `user_data_dir` to point to a folder. - -7. **`cookies`** & **`headers`**: - - If you want to start with specific cookies or add universal HTTP headers, set them here. - - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`. - -8. **`user_agent`**: - - Custom User-Agent string. If `None`, a default is used. - - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). - -9. **`text_mode`** & **`light_mode`**: - - `text_mode=True` disables images, possibly speeding up text-only crawls. - - `light_mode=True` turns off certain background features for performance. - -10. **`extra_args`**: - - Additional flags for the underlying browser. - - E.g. `["--disable-extensions"]`. 
- -### Helper Methods - -Both configuration classes provide a `clone()` method to create modified copies: - -```python -# Create a base browser config -base_browser = BrowserConfig( - browser_type="chromium", - headless=True, - text_mode=True -) - -# Create a visible browser config for debugging -debug_browser = base_browser.clone( - headless=False, - verbose=True -) -``` - -**Minimal Example**: - -```python -from crawl4ai import AsyncWebCrawler, BrowserConfig - -browser_conf = BrowserConfig( - browser_type="firefox", - headless=False, - text_mode=True -) - -async with AsyncWebCrawler(config=browser_conf) as crawler: - result = await crawler.arun("https://example.com") - print(result.markdown[:300]) -``` - ---- - -## 2. CrawlerRunConfig Essentials - -```python -class CrawlerRunConfig: - def __init__( - word_count_threshold=200, - extraction_strategy=None, - markdown_generator=None, - cache_mode=None, - js_code=None, - wait_for=None, - screenshot=False, - pdf=False, - capture_mhtml=False, - # Location and Identity Parameters - locale=None, # e.g. "en-US", "fr-FR" - timezone_id=None, # e.g. "America/New_York" - geolocation=None, # GeolocationConfig object - # Resource Management - enable_rate_limiting=False, - rate_limit_config=None, - memory_threshold_percent=70.0, - check_interval=1.0, - max_session_permit=20, - display_mode=None, - verbose=True, - stream=False, # Enable streaming for arun_many() - # ... other advanced parameters omitted - ): - ... -``` - -### Key Fields to Note - -1. **`word_count_threshold`**: - - The minimum word count before a block is considered. - - If your site has lots of short paragraphs or items, you can lower it. - -2. **`extraction_strategy`**: - - Where you plug in JSON-based extraction (CSS, LLM, etc.). - - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown). - -3. **`markdown_generator`**: - - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done. 
- - If `None`, a default approach is used. - -4. **`cache_mode`**: - - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.). - - If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`. - -5. **`js_code`**: - - A string or list of JS strings to execute. - - Great for "Load More" buttons or user interactions. - -6. **`wait_for`**: - - A CSS or JS expression to wait for before extracting content. - - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. - -7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**: - - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. - - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). - -8. **Location Parameters**: - - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences - - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`) - - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)` - - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control) - -9. **`verbose`**: - - Logs additional runtime details. - - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`. - -10. **`enable_rate_limiting`**: - - If `True`, enables rate limiting for batch processing. - - Requires `rate_limit_config` to be set. - -11. **`memory_threshold_percent`**: - - The memory threshold (as a percentage) to monitor. - - If exceeded, the crawler will pause or slow down. - -12. **`check_interval`**: - - The interval (in seconds) to check system resources. - - Affects how often memory and CPU usage are monitored. - -13. **`max_session_permit`**: - - The maximum number of concurrent crawl sessions. - - Helps prevent overwhelming the system. - -14. 
**`display_mode`**: - - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - - Affects how much information is printed during the crawl. - - -### Helper Methods - -The `clone()` method is particularly useful for creating variations of your crawler configuration: - -```python -# Create a base configuration -base_config = CrawlerRunConfig( - cache_mode=CacheMode.ENABLED, - word_count_threshold=200, - wait_until="networkidle" -) - -# Create variations for different use cases -stream_config = base_config.clone( - stream=True, # Enable streaming mode - cache_mode=CacheMode.BYPASS -) - -debug_config = base_config.clone( - page_timeout=120000, # Longer timeout for debugging - verbose=True -) -``` - -The `clone()` method: -- Creates a new instance with all the same settings -- Updates only the specified parameters -- Leaves the original configuration unchanged -- Perfect for creating variations without repeating all parameters - ---- - - -## 3. LLMConfig Essentials - -### Key fields to note - -1. **`provider`**: -- Which LLM provoder to use. -- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)* - -2. **`api_token`**: - - Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables - - API token of LLM provider
eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` - - Environment variable - use with prefix "env:"
eg:`api_token = "env: GROQ_API_KEY"` - -3. **`base_url`**: - - If your provider has a custom endpoint - -```python -llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) -``` - -## 4. Putting It All Together - -In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -async def main(): - # 1) Browser config: headless, bigger viewport, no proxy - browser_conf = BrowserConfig( - headless=True, - viewport_width=1280, - viewport_height=720 - ) - - # 2) Example extraction strategy - schema = { - "name": "Articles", - "baseSelector": "div.article", - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} - ] - } - extraction = JsonCssExtractionStrategy(schema) - - # 3) Example LLM content filtering - - gemini_config = LLMConfig( - provider="gemini/gemini-1.5-pro" - api_token = "env:GEMINI_API_TOKEN" - ) - - # Initialize LLM filter with specific instruction - filter = LLMContentFilter( - llm_config=gemini_config, # or your preferred provider - instruction=""" - Focus on extracting the core educational content. - Include: - - Key concepts and explanations - - Important code examples - - Essential technical details - Exclude: - - Navigation elements - - Sidebars - - Footer content - Format the output as clean markdown with proper code blocks and headers. 
- """, - chunk_token_threshold=500, # Adjust based on your needs - verbose=True - ) - - md_generator = DefaultMarkdownGenerator( - content_filter=filter, - options={"ignore_links": True} - - # 4) Crawler run config: skip cache, use extraction - run_conf = CrawlerRunConfig( - markdown_generator=md_generator, - extraction_strategy=extraction, - cache_mode=CacheMode.BYPASS, - ) - - async with AsyncWebCrawler(config=browser_conf) as crawler: - # 4) Execute the crawl - result = await crawler.arun(url="https://example.com/news", config=run_conf) - - if result.success: - print("Extracted content:", result.extracted_content) - else: - print("Error:", result.error_message) - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -## 5. Next Steps - -For a **detailed list** of available parameters (including advanced ones), see: - -- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md) - -You can explore topics like: - -- **Custom Hooks & Auth** (Inject JavaScript or handle login forms). -- **Session Management** (Re-use pages, preserve state across multiple calls). -- **Magic Mode** or **Identity-based Crawling** (Fight bot detection by simulating user behavior). -- **Advanced Caching** (Fine-tune read/write cache modes). - ---- - -## 6. Conclusion - -**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define: - -- **Which** browser to launch, how it should run, and any proxy or user agent needs. -- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. -- **Which** LLM provider to use, api token, temperature and base url for custom endpoints - -Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! 
-``` - - -## File: docs/md_v2/core/cache-modes.md - -```md -# Crawl4AI Cache System and Migration Guide - -## Overview -Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. - -## Old vs New Approach - -### Old Way (Deprecated) -The old system used multiple boolean flags: -- `bypass_cache`: Skip cache entirely -- `disable_cache`: Disable all caching -- `no_cache_read`: Don't read from cache -- `no_cache_write`: Don't write to cache - -### New Way (Recommended) -The new system uses a single `CacheMode` enum: -- `CacheMode.ENABLED`: Normal caching (read/write) -- `CacheMode.DISABLED`: No caching at all -- `CacheMode.READ_ONLY`: Only read from cache -- `CacheMode.WRITE_ONLY`: Only write to cache -- `CacheMode.BYPASS`: Skip cache for this operation - -## Migration Example - -### Old Code (Deprecated) -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def use_proxy(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - bypass_cache=True # Old way - ) - print(len(result.markdown)) - -async def main(): - await use_proxy() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### New Code (Recommended) -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.async_configs import CrawlerRunConfig - -async def use_proxy(): - # Use CacheMode in CrawlerRunConfig - config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=config # Pass the configuration object - ) - print(len(result.markdown)) - -async def main(): - await use_proxy() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Common Migration Patterns - -| Old Flag | New 
Mode | -|-----------------------|---------------------------------| -| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | -| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| -| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | -| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | -``` - - -## File: docs/md_v2/core/content-selection.md - -```md -# Content Selection - -Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters. - -Below, we show how to configure these parameters and combine them for precise control. - ---- - -## 1. CSS-Based Selection - -There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`. - -### 1.1 Using `css_selector` - -A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -async def main(): - config = CrawlerRunConfig( - # e.g., first 30 items from Hacker News - css_selector=".athing:nth-child(-n+30)" - ) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://news.ycombinator.com/newest", - config=config - ) - print("Partial HTML length:", len(result.cleaned_html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Result**: Only elements matching that selector remain in `result.cleaned_html`. 
- -### 1.2 Using `target_elements` - -The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -async def main(): - config = CrawlerRunConfig( - # Target article body and sidebar, but not other content - target_elements=["article.main-content", "aside.sidebar"] - ) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://example.com/blog-post", - config=config - ) - print("Markdown focused on target elements") - print("Links from entire page still available:", len(result.links.get("internal", []))) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection. - ---- - -## 2. Content Filtering & Exclusions - -### 2.1 Basic Overview - -```python -config = CrawlerRunConfig( - # Content thresholds - word_count_threshold=10, # Minimum words per block - - # Tag exclusions - excluded_tags=['form', 'header', 'footer', 'nav'], - - # Link filtering - exclude_external_links=True, - exclude_social_media_links=True, - # Block entire domains - exclude_domains=["adtrackers.com", "spammynews.org"], - exclude_social_media_domains=["facebook.com", "twitter.com"], - - # Media filtering - exclude_external_images=True -) -``` - -**Explanation**: - -- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers. -- **`excluded_tags`**: Removes entire tags (`
`, `
`, `