feat(proxy): add proxy rotation strategy

Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
2025-02-09 18:49:10 +08:00
parent b957ff2ecd
commit 19df96ed56
12 changed files with 257 additions and 162 deletions
--- a/docs/examples/proxy_rotation_demo.py
+++ b/docs/examples/proxy_rotation_demo.py
@@ -0,0 +1,161 @@
+import os
+import re
+from typing import List, Dict
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    RoundRobinProxyStrategy
+)
+
+def load_proxies_from_env() -> List[Dict]:
+    """Load proxy definitions from the PROXIES environment variable.
+
+    PROXIES is a comma-separated list of ``ip:port:username:password``
+    entries. Whitespace around an entry is ignored and blank entries are
+    skipped. The password is everything after the third colon, so it may
+    itself contain colons.
+
+    A malformed entry (fewer than four colon-separated fields) is reported
+    and skipped; it does not discard the entries that follow it.
+
+    Returns:
+        A list of dicts with the keys ``server``, ``username``, ``password``
+        and ``ip``. The list is empty when PROXIES is unset or contains no
+        valid entry.
+    """
+    proxies = []
+    raw_entries = os.getenv("PROXIES", "").split(",")
+    for position, raw_entry in enumerate(raw_entries, start=1):
+        entry = raw_entry.strip()
+        if not entry:
+            continue
+        try:
+            # maxsplit=3 keeps any ':' inside the password intact.
+            ip, port, username, password = entry.split(":", 3)
+        except ValueError as e:
+            # Report the position only: the entry itself carries credentials.
+            print(f"Error loading proxies from environment: entry {position}: {e}")
+            continue
+        proxies.append({
+            "server": f"http://{ip}:{port}",
+            "username": username,
+            "password": password,
+            "ip": ip,  # Store original IP for verification
+        })
+    return proxies
+
+async def demo_proxy_rotation():
+    """
+    Round-robin proxy rotation, one request at a time
+    =================================================
+    Loads proxies from the environment, requests https://httpbin.org/ip once
+    per loaded proxy through an AsyncWebCrawler whose run config carries a
+    RoundRobinProxyStrategy, and prints whether the IP found in each response
+    equals the "ip" of the proxy set on the run config after that request.
+    """
+    print("\n=== Proxy Rotation Demo (Round Robin) ===")
+
+    # Nothing to rotate through without at least one proxy.
+    proxies = load_proxies_from_env()
+    if not proxies:
+        print("No proxies found in environment. Set PROXIES env variable!")
+        return
+
+    rotation = RoundRobinProxyStrategy(proxies)
+    browser_cfg = BrowserConfig(headless=True, verbose=False)
+    crawl_cfg = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        proxy_rotation_strategy=rotation
+    )
+
+    # The same endpoint is requested once per proxy.
+    target_url = "https://httpbin.org/ip"
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        for _ in range(len(proxies)):
+            outcome = await crawler.arun(url=target_url, config=crawl_cfg)
+
+            if not outcome.success:
+                print(f"Request failed: {outcome.error_message}")
+                continue
+
+            # First dotted-quad in the response body, if any.
+            found = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', outcome.html)
+            active_proxy = crawl_cfg.proxy_config or None
+            if not active_proxy:
+                continue
+
+            seen_ip = found.group(0) if found else 'Not found'
+            print(f"Proxy {active_proxy['server']} -> Response IP: {seen_ip}")
+            if found and found.group(0) == active_proxy['ip']:
+                print(f"✅ Proxy working! IP matches: {active_proxy['ip']}")
+            else:
+                print("❌ Proxy failed or IP mismatch!")
+
+async def demo_proxy_rotation_batch():
+    """
+    Proxy Rotation Demo with Batch Processing
+    =======================================
+    Demonstrates proxy rotation using arun_many with memory dispatcher.
+
+    Requests https://httpbin.org/ip twice per loaded proxy in a single
+    arun_many call. For every successful result, the IP found in the response
+    is looked up among the proxies loaded from the PROXIES environment
+    variable; a hit counts as one successful proxy verification.
+
+    Any exception raised along the way is caught and printed, not propagated.
+    """
+    print("\n=== Proxy Rotation Batch Demo ===")
+
+    try:
+        # Imported here so the function also works when this module is
+        # imported instead of being run as a script.
+        from crawl4ai import DefaultMarkdownGenerator, MemoryAdaptiveDispatcher
+
+        # Load proxies and create rotation strategy
+        proxies = load_proxies_from_env()
+        if not proxies:
+            print("No proxies found in environment. Set PROXIES env variable!")
+            return
+
+        proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+        # Configurations
+        browser_config = BrowserConfig(headless=True, verbose=False)
+        run_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            proxy_rotation_strategy=proxy_strategy,
+            markdown_generator=DefaultMarkdownGenerator()
+        )
+
+        # Test URLs - multiple requests to test rotation
+        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice
+
+        # A response IP is verified by looking it up here. run_config holds a
+        # single proxy_config value, so once the whole batch has finished it
+        # cannot tell which proxy served which result.
+        proxies_by_ip = {proxy["ip"]: proxy for proxy in proxies}
+
+        print("\n📈 Initializing crawler with proxy rotation...")
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=80.0,
+                check_interval=0.5,
+                # NOTE(review): presumably caps the crawl at one concurrent
+                # session - confirm against the dispatcher documentation.
+                max_session_permit=1,
+            )
+
+            print("\n🚀 Starting batch crawl with proxy rotation...")
+            results = await crawler.arun_many(
+                urls=urls,
+                config=run_config,
+                dispatcher=dispatcher
+            )
+
+            # Verify results
+            success_count = 0
+            for result in results:
+                if not result.success:
+                    continue
+
+                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+                if ip_match:
+                    response_ip = ip_match.group(0)
+                    used_proxy = proxies_by_ip.get(response_ip)
+                    print(f"URL {result.url}")
+                    if used_proxy:
+                        print(f"Proxy {used_proxy['server']} -> Response IP: {response_ip}")
+                        print(f"✅ Proxy working! IP matches: {used_proxy['ip']}")
+                        success_count += 1
+                    else:
+                        print(f"Response IP: {response_ip}")
+                        print("❌ Proxy failed or IP mismatch!")
+                print("---")
+
+            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")
+
+    except Exception as e:
+        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
+
+# Script entry point: runs only when this file is executed directly.
+if __name__ == "__main__":
+    import asyncio
+    # Batch-demo helpers from crawl4ai, bound as module-level names at start-up.
+    from crawl4ai import (
+        CrawlerMonitor, 
+        DisplayMode,
+        MemoryAdaptiveDispatcher,
+        DefaultMarkdownGenerator
+    )
+    
+    # Single coroutine handed to asyncio.run; only the batch demo is enabled.
+    async def run_demos():
+        # await demo_proxy_rotation()  # Original single-request demo
+        await demo_proxy_rotation_batch()  # New batch processing demo
+        
+    asyncio.run(run_demos())
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,276 +0,0 @@
-import os, sys
-
-# append the parent directory to the sys.path
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(parent_dir)
-parent_parent_dir = os.path.dirname(parent_dir)
-sys.path.append(parent_parent_dir)
-__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-__data__ = os.path.join(__location__, "__data")
-import asyncio
-from pathlib import Path
-import aiohttp
-import json
-from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.content_filter_strategy import BM25ContentFilter
-
-
-# 1. File Download Processing Example
-async def download_example():
-    """Example of downloading files from Python.org"""
-    # downloads_path = os.path.join(os.getcwd(), "downloads")
-    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
-    os.makedirs(downloads_path, exist_ok=True)
-
-    print(f"Downloads will be saved to: {downloads_path}")
-
-    async with AsyncWebCrawler(
-        accept_downloads=True, downloads_path=downloads_path, verbose=True
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://www.python.org/downloads/",
-            js_code="""
-            // Find and click the first Windows installer link
-            const downloadLink = document.querySelector('a[href$=".exe"]');
-            if (downloadLink) {
-                console.log('Found download link:', downloadLink.href);
-                downloadLink.click();
-            } else {
-                console.log('No .exe download link found');
-            }
-            """,
-            delay_before_return_html=1,  # Wait 5 seconds to ensure download starts
-            cache_mode=CacheMode.BYPASS,
-        )
-
-        if result.downloaded_files:
-            print("\nDownload successful!")
-            print("Downloaded files:")
-            for file_path in result.downloaded_files:
-                print(f"- {file_path}")
-                print(f"  File size: {os.path.getsize(file_path) / (1024*1024):.2f} MB")
-        else:
-            print("\nNo files were downloaded")
-
-
-# 2. Local File and Raw HTML Processing Example
-async def local_and_raw_html_example():
-    """Example of processing local files and raw HTML"""
-    # Create a sample HTML file
-    sample_file = os.path.join(__data__, "sample.html")
-    with open(sample_file, "w") as f:
-        f.write(
-            """
-        <html><body>
-            <h1>Test Content</h1>
-            <p>This is a test paragraph.</p>
-        </body></html>
-        """
-        )
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        # Process local file
-        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
-
-        # Process raw HTML
-        raw_html = """
-        <html><body>
-            <h1>Raw HTML Test</h1>
-            <p>This is a test of raw HTML processing.</p>
-        </body></html>
-        """
-        raw_result = await crawler.arun(url=f"raw:{raw_html}")
-
-        # Clean up
-        os.remove(sample_file)
-
-        print("Local file content:", local_result.markdown)
-        print("\nRaw HTML content:", raw_result.markdown)
-
-
-# 3. Enhanced Markdown Generation Example
-async def markdown_generation_example():
-    """Example of enhanced markdown generation with citations and LLM-friendly features"""
-    async with AsyncWebCrawler(verbose=True) as crawler:
-        # Create a content filter (optional)
-        content_filter = BM25ContentFilter(
-            # user_query="History and cultivation",
-            bm25_threshold=1.0
-        )
-
-        result = await crawler.arun(
-            url="https://en.wikipedia.org/wiki/Apple",
-            css_selector="main div#bodyContent",
-            content_filter=content_filter,
-            cache_mode=CacheMode.BYPASS,
-        )
-
-        from crawl4ai.content_filter_strategy import BM25ContentFilter
-
-        result = await crawler.arun(
-            url="https://en.wikipedia.org/wiki/Apple",
-            css_selector="main div#bodyContent",
-            content_filter=BM25ContentFilter(),
-        )
-        print(result.markdown_v2.fit_markdown)
-
-        print("\nMarkdown Generation Results:")
-        print(f"1. Original markdown length: {len(result.markdown)}")
-        print("2. New markdown versions (markdown_v2):")
-        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
-        print(
-            f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
-        )
-        print(
-            f"   - References section length: {len(result.markdown_v2.references_markdown)}"
-        )
-        if result.markdown_v2.fit_markdown:
-            print(
-                f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
-            )
-
-        # Save examples to files
-        output_dir = os.path.join(__data__, "markdown_examples")
-        os.makedirs(output_dir, exist_ok=True)
-
-        # Save different versions
-        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
-            f.write(result.markdown_v2.raw_markdown)
-
-        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
-            f.write(result.markdown_v2.markdown_with_citations)
-
-        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
-            f.write(result.markdown_v2.references_markdown)
-
-        if result.markdown_v2.fit_markdown:
-            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
-                f.write(result.markdown_v2.fit_markdown)
-
-        print(f"\nMarkdown examples saved to: {output_dir}")
-
-        # Show a sample of citations and references
-        print("\nSample of markdown with citations:")
-        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
-        print("Sample of references:")
-        print(
-            "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
-        )
-
-
-# 4. Browser Management Example
-async def browser_management_example():
-    """Example of using enhanced browser management features"""
-    # Use the specified user directory path
-    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
-    os.makedirs(user_data_dir, exist_ok=True)
-
-    print(f"Browser profile will be saved to: {user_data_dir}")
-
-    async with AsyncWebCrawler(
-        use_managed_browser=True,
-        user_data_dir=user_data_dir,
-        headless=False,
-        verbose=True,
-    ) as crawler:
-        result = await crawler.arun(
-            url="https://crawl4ai.com",
-            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS,
-        )
-        # Use GitHub as an example - it's a good test for browser management
-        # because it requires proper browser handling
-        result = await crawler.arun(
-            url="https://github.com/trending",
-            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS,
-        )
-
-        print("\nBrowser session result:", result.success)
-        if result.success:
-            print("Page title:", result.metadata.get("title", "No title found"))
-
-
-# 5. API Usage Example
-async def api_example():
-    """Example of using the new API endpoints"""
-    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
-    headers = {"Authorization": f"Bearer {api_token}"}
-    async with aiohttp.ClientSession() as session:
-        # Submit crawl job
-        crawl_request = {
-            "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
-            "extraction_config": {
-                "type": "json_css",
-                "params": {
-                    "schema": {
-                        "name": "Hacker News Articles",
-                        "baseSelector": ".athing",
-                        "fields": [
-                            {"name": "title", "selector": ".title a", "type": "text"},
-                            {"name": "score", "selector": ".score", "type": "text"},
-                            {
-                                "name": "url",
-                                "selector": ".title a",
-                                "type": "attribute",
-                                "attribute": "href",
-                            },
-                        ],
-                    }
-                },
-            },
-            "crawler_params": {
-                "headless": True,
-                # "use_managed_browser": True
-            },
-            "cache_mode": "bypass",
-            # "screenshot": True,
-            # "magic": True
-        }
-
-        async with session.post(
-            "http://localhost:11235/crawl", json=crawl_request, headers=headers
-        ) as response:
-            task_data = await response.json()
-            task_id = task_data["task_id"]
-
-            # Check task status
-            while True:
-                async with session.get(
-                    f"http://localhost:11235/task/{task_id}", headers=headers
-                ) as status_response:
-                    result = await status_response.json()
-                    print(f"Task status: {result['status']}")
-
-                    if result["status"] == "completed":
-                        print("Task completed!")
-                        print("Results:")
-                        news = json.loads(result["results"][0]["extracted_content"])
-                        print(json.dumps(news[:4], indent=2))
-                        break
-                    else:
-                        await asyncio.sleep(1)
-
-
-# Main execution
-async def main():
-    # print("Running Crawl4AI feature examples...")
-
-    # print("\n1. Running Download Example:")
-    # await download_example()
-
-    # print("\n2. Running Markdown Generation Example:")
-    # await markdown_generation_example()
-
-    # # print("\n3. Running Local and Raw HTML Example:")
-    # await local_and_raw_html_example()
-
-    # # print("\n4. Running Browser Management Example:")
-    await browser_management_example()
-
-    # print("\n5. Running API Example:")
-    await api_example()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/docs/examples/v0_4_24_walkthrough.py
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -1,464 +0,0 @@
-"""
-Crawl4AI v0.4.24 Feature Walkthrough
-===================================
-
-This script demonstrates the new features introduced in Crawl4AI v0.4.24.
-Each section includes detailed examples and explanations of the new capabilities.
-"""
-
-import asyncio
-import os
-import json
-import re
-from typing import List
-from crawl4ai import (
-    AsyncWebCrawler,
-    BrowserConfig,
-    CrawlerRunConfig,
-    CacheMode,
-    LLMExtractionStrategy,
-    JsonCssExtractionStrategy,
-)
-from crawl4ai.content_filter_strategy import RelevantContentFilter
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from bs4 import BeautifulSoup
-
-# Sample HTML for demonstrations
-SAMPLE_HTML = """
-<div class="article-list">
-    <article class="post" data-category="tech" data-author="john">
-        <h2 class="title"><a href="/post-1">First Post</a></h2>
-        <div class="meta">
-            <a href="/author/john" class="author">John Doe</a>
-            <span class="date">2023-12-31</span>
-        </div>
-        <div class="content">
-            <p>First post content...</p>
-            <a href="/read-more-1" class="read-more">Read More</a>
-        </div>
-    </article>
-    <article class="post" data-category="science" data-author="jane">
-        <h2 class="title"><a href="/post-2">Second Post</a></h2>
-        <div class="meta">
-            <a href="/author/jane" class="author">Jane Smith</a>
-            <span class="date">2023-12-30</span>
-        </div>
-        <div class="content">
-            <p>Second post content...</p>
-            <a href="/read-more-2" class="read-more">Read More</a>
-        </div>
-    </article>
-</div>
-"""
-
-
-async def demo_ssl_features():
-    """
-    Enhanced SSL & Security Features Demo
-    -----------------------------------
-
-    This example demonstrates the new SSL certificate handling and security features:
-    1. Custom certificate paths
-    2. SSL verification options
-    3. HTTPS error handling
-    4. Certificate validation configurations
-
-    These features are particularly useful when:
-    - Working with self-signed certificates
-    - Dealing with corporate proxies
-    - Handling mixed content websites
-    - Managing different SSL security levels
-    """
-    print("\n1. Enhanced SSL & Security Demo")
-    print("--------------------------------")
-
-    browser_config = BrowserConfig()
-
-    run_config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS,
-        fetch_ssl_certificate=True,  # Enable SSL certificate fetching
-    )
-
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(url="https://example.com", config=run_config)
-        print(f"SSL Crawl Success: {result.success}")
-        result.ssl_certificate.to_json(
-            os.path.join(os.getcwd(), "ssl_certificate.json")
-        )
-        if not result.success:
-            print(f"SSL Error: {result.error_message}")
-
-
-async def demo_content_filtering():
-    """
-    Smart Content Filtering Demo
-    ----------------------
-
-    Demonstrates advanced content filtering capabilities:
-    1. Custom filter to identify and extract specific content
-    2. Integration with markdown generation
-    3. Flexible pruning rules
-    """
-    print("\n2. Smart Content Filtering Demo")
-    print("--------------------------------")
-
-    # Create a custom content filter
-    class CustomNewsFilter(RelevantContentFilter):
-        def __init__(self):
-            super().__init__()
-            # Add news-specific patterns
-            self.negative_patterns = re.compile(
-                r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
-                re.I,
-            )
-            self.min_word_count = 30  # Higher threshold for news content
-
-        def filter_content(
-            self, html: str, min_word_threshold: int = None
-        ) -> List[str]:
-            """
-            Implements news-specific content filtering logic.
-
-            Args:
-                html (str): HTML content to be filtered
-                min_word_threshold (int, optional): Minimum word count threshold
-
-            Returns:
-                List[str]: List of filtered HTML content blocks
-            """
-            if not html or not isinstance(html, str):
-                return []
-
-            soup = BeautifulSoup(html, "lxml")
-            if not soup.body:
-                soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
-
-            body = soup.find("body")
-
-            # Extract chunks with metadata
-            chunks = self.extract_text_chunks(
-                body, min_word_threshold or self.min_word_count
-            )
-
-            # Filter chunks based on news-specific criteria
-            filtered_chunks = []
-            for _, text, tag_type, element in chunks:
-                # Skip if element has negative class/id
-                if self.is_excluded(element):
-                    continue
-
-                # Headers are important in news articles
-                if tag_type == "header":
-                    filtered_chunks.append(self.clean_element(element))
-                    continue
-
-                # For content, check word count and link density
-                text = element.get_text(strip=True)
-                if len(text.split()) >= (min_word_threshold or self.min_word_count):
-                    # Calculate link density
-                    links_text = " ".join(
-                        a.get_text(strip=True) for a in element.find_all("a")
-                    )
-                    link_density = len(links_text) / len(text) if text else 1
-
-                    # Accept if link density is reasonable
-                    if link_density < 0.5:
-                        filtered_chunks.append(self.clean_element(element))
-
-            return filtered_chunks
-
-    # Create markdown generator with custom filter
-    markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
-
-    run_config = CrawlerRunConfig(
-        markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
-    )
-
-    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            url="https://news.ycombinator.com", config=run_config
-        )
-        print("Filtered Content Sample:")
-        print(result.markdown[:500])  # Show first 500 chars
-
-
-async def demo_json_extraction():
-    """
-    Improved JSON Extraction Demo
-    ---------------------------
-
-    Demonstrates the enhanced JSON extraction capabilities:
-    1. Base element attributes extraction
-    2. Complex nested structures
-    3. Multiple extraction patterns
-
-    Key features shown:
-    - Extracting attributes from base elements (href, data-* attributes)
-    - Processing repeated patterns
-    - Handling optional fields
-    """
-    print("\n3. Improved JSON Extraction Demo")
-    print("--------------------------------")
-
-    # Define the extraction schema with base element attributes
-    json_strategy = JsonCssExtractionStrategy(
-        schema={
-            "name": "Blog Posts",
-            "baseSelector": "div.article-list",
-            "baseFields": [
-                {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
-                {"name": "category", "type": "attribute", "attribute": "data-category"},
-            ],
-            "fields": [
-                {
-                    "name": "posts",
-                    "selector": "article.post",
-                    "type": "nested_list",
-                    "baseFields": [
-                        {
-                            "name": "post_id",
-                            "type": "attribute",
-                            "attribute": "data-post-id",
-                        },
-                        {
-                            "name": "author_id",
-                            "type": "attribute",
-                            "attribute": "data-author",
-                        },
-                    ],
-                    "fields": [
-                        {
-                            "name": "title",
-                            "selector": "h2.title a",
-                            "type": "text",
-                            "baseFields": [
-                                {
-                                    "name": "url",
-                                    "type": "attribute",
-                                    "attribute": "href",
-                                }
-                            ],
-                        },
-                        {
-                            "name": "author",
-                            "selector": "div.meta a.author",
-                            "type": "text",
-                            "baseFields": [
-                                {
-                                    "name": "profile_url",
-                                    "type": "attribute",
-                                    "attribute": "href",
-                                }
-                            ],
-                        },
-                        {"name": "date", "selector": "span.date", "type": "text"},
-                        {
-                            "name": "read_more",
-                            "selector": "a.read-more",
-                            "type": "nested",
-                            "fields": [
-                                {"name": "text", "type": "text"},
-                                {
-                                    "name": "url",
-                                    "type": "attribute",
-                                    "attribute": "href",
-                                },
-                            ],
-                        },
-                    ],
-                }
-            ],
-        }
-    )
-
-    # Demonstrate extraction from raw HTML
-    run_config = CrawlerRunConfig(
-        extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
-    )
-
-    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            url="raw:" + SAMPLE_HTML,  # Use raw: prefix for raw HTML
-            config=run_config,
-        )
-        print("Extracted Content:")
-        print(result.extracted_content)
-
-
-async def demo_input_formats():
-    """
-    Input Format Handling Demo
-    ----------------------
-
-    Demonstrates how LLM extraction can work with different input formats:
-    1. Markdown (default) - Good for simple text extraction
-    2. HTML - Better when you need structure and attributes
-
-    This example shows how HTML input can be beneficial when:
-    - You need to understand the DOM structure
-    - You want to extract both visible text and HTML attributes
-    - The content has complex layouts like tables or forms
-    """
-    print("\n4. Input Format Handling Demo")
-    print("---------------------------")
-
-    # Create a dummy HTML with rich structure
-    dummy_html = """
-    <div class="job-posting" data-post-id="12345">
-        <header class="job-header">
-            <h1 class="job-title">Senior AI/ML Engineer</h1>
-            <div class="job-meta">
-                <span class="department">AI Research Division</span>
-                <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span>
-            </div>
-            <div class="salary-info" data-currency="USD">
-                <span class="range">$150,000 - $220,000</span>
-                <span class="period">per year</span>
-            </div>
-        </header>
-        
-        <section class="requirements">
-            <div class="technical-skills">
-                <h3>Technical Requirements</h3>
-                <ul class="required-skills">
-                    <li class="skill required" data-priority="must-have">
-                        5+ years experience in Machine Learning
-                    </li>
-                    <li class="skill required" data-priority="must-have">
-                        Proficiency in Python and PyTorch/TensorFlow
-                    </li>
-                    <li class="skill preferred" data-priority="nice-to-have">
-                        Experience with distributed training systems
-                    </li>
-                </ul>
-            </div>
-            
-            <div class="soft-skills">
-                <h3>Professional Skills</h3>
-                <ul class="required-skills">
-                    <li class="skill required" data-priority="must-have">
-                        Strong problem-solving abilities
-                    </li>
-                    <li class="skill preferred" data-priority="nice-to-have">
-                        Experience leading technical teams
-                    </li>
-                </ul>
-            </div>
-        </section>
-        
-        <section class="timeline">
-            <time class="deadline" datetime="2024-02-28">
-                Application Deadline: February 28, 2024
-            </time>
-        </section>
-        
-        <footer class="contact-section">
-            <div class="hiring-manager">
-                <h4>Hiring Manager</h4>
-                <div class="contact-info">
-                    <span class="name">Dr. Sarah Chen</span>
-                    <span class="title">Director of AI Research</span>
-                    <span class="email">ai.hiring@example.com</span>
-                </div>
-            </div>
-            <div class="team-info">
-                <p>Join our team of 50+ researchers working on cutting-edge AI applications</p>
-            </div>
-        </footer>
-    </div>
-    """
-
-    # Use raw:// prefix to pass HTML content directly
-    url = f"raw://{dummy_html}"
-
-    from pydantic import BaseModel, Field
-    from typing import List, Optional
-
-    # Define our schema using Pydantic
-    class JobRequirement(BaseModel):
-        category: str = Field(
-            description="Category of the requirement (e.g., Technical, Soft Skills)"
-        )
-        items: List[str] = Field(
-            description="List of specific requirements in this category"
-        )
-        priority: str = Field(
-            description="Priority level (Required/Preferred) based on the HTML class or context"
-        )
-
-    class JobPosting(BaseModel):
-        title: str = Field(description="Job title")
-        department: str = Field(description="Department or team")
-        location: str = Field(description="Job location, including remote options")
-        salary_range: Optional[str] = Field(description="Salary range if specified")
-        requirements: List[JobRequirement] = Field(
-            description="Categorized job requirements"
-        )
-        application_deadline: Optional[str] = Field(
-            description="Application deadline if specified"
-        )
-        contact_info: Optional[dict] = Field(
-            description="Contact information from footer or contact section"
-        )
-
-    # First try with markdown (default)
-    markdown_strategy = LLMExtractionStrategy(
-        provider="openai/gpt-4o",
-        api_token=os.getenv("OPENAI_API_KEY"),
-        schema=JobPosting.model_json_schema(),
-        extraction_type="schema",
-        instruction="""
-        Extract job posting details into structured data. Focus on the visible text content 
-        and organize requirements into categories.
-        """,
-        input_format="markdown",  # default
-    )
-
-    # Then with HTML for better structure understanding
-    html_strategy = LLMExtractionStrategy(
-        provider="openai/gpt-4",
-        api_token=os.getenv("OPENAI_API_KEY"),
-        schema=JobPosting.model_json_schema(),
-        extraction_type="schema",
-        instruction="""
-        Extract job posting details, using HTML structure to:
-        1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
-        2. Extract contact info from the page footer or dedicated contact section
-        3. Parse salary information from specially formatted elements
-        4. Determine application deadline from timestamp or date elements
-        
-        Use HTML attributes and classes to enhance extraction accuracy.
-        """,
-        input_format="html",  # explicitly use HTML
-    )
-
-    async with AsyncWebCrawler() as crawler:
-        # Try with markdown first
-        markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
-        markdown_result = await crawler.arun(url=url, config=markdown_config)
-        print("\nMarkdown-based Extraction Result:")
-        items = json.loads(markdown_result.extracted_content)
-        print(json.dumps(items, indent=2))
-
-        # Then with HTML for better structure understanding
-        html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
-        html_result = await crawler.arun(url=url, config=html_config)
-        print("\nHTML-based Extraction Result:")
-        items = json.loads(html_result.extracted_content)
-        print(json.dumps(items, indent=2))
-
-
-# Main execution
-async def main():
-    print("Crawl4AI v0.4.24 Feature Walkthrough")
-    print("====================================")
-
-    # Run all demos
-    await demo_ssl_features()
-    await demo_content_filtering()
-    await demo_json_extraction()
-    # await demo_input_formats()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
--- a/docs/examples/v0_4_3b2_features_demo.py
+++ b/docs/examples/v0_4_3b2_features_demo.py
@@ -1,351 +0,0 @@
-"""
-Crawl4ai v0.4.3b2 Features Demo
-============================
-
-This demonstration showcases three major categories of new features in Crawl4ai v0.4.3:
-
-1. Efficiency & Speed:
-   - Memory-efficient dispatcher strategies
-   - New scraping algorithm
-   - Streaming support for batch crawling
-
-2. LLM Integration:
-   - Automatic schema generation
-   - LLM-powered content filtering
-   - Smart markdown generation
-
-3. Core Improvements:
-   - Robots.txt compliance
-   - Proxy rotation
-   - Enhanced URL handling
-   - Shared data among hooks
-   - add page routes
-
-Each demo function can be run independently or as part of the full suite.
-"""
-
-import asyncio
-import os
-import json
-import re
-import random
-from typing import Optional, Dict
-from dotenv import load_dotenv
-
-load_dotenv()
-
-from crawl4ai import (
-    AsyncWebCrawler, 
-    BrowserConfig,
-    CrawlerRunConfig,
-    CacheMode,
-    DisplayMode,
-    MemoryAdaptiveDispatcher,
-    CrawlerMonitor,
-    DefaultMarkdownGenerator,
-    LXMLWebScrapingStrategy,
-    JsonCssExtractionStrategy,
-    LLMContentFilter
-)
-
-
-async def demo_memory_dispatcher():
-    """Demonstrates the new memory-efficient dispatcher system.
-    
-    Key Features:
-    - Adaptive memory management
-    - Real-time performance monitoring
-    - Concurrent session control
-    """
-    print("\n=== Memory Dispatcher Demo ===")
-    
-    try:
-        # Configuration
-        browser_config = BrowserConfig(headless=True, verbose=False)
-        crawler_config = CrawlerRunConfig(
-            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator()
-        )
-
-        # Test URLs
-        urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
-
-        print("\n📈 Initializing crawler with memory monitoring...")
-        async with AsyncWebCrawler(config=browser_config) as crawler:
-            monitor = CrawlerMonitor(
-                max_visible_rows=10,
-                display_mode=DisplayMode.DETAILED
-            )
-            
-            dispatcher = MemoryAdaptiveDispatcher(
-                memory_threshold_percent=80.0,
-                check_interval=0.5,
-                max_session_permit=5,
-                monitor=monitor
-            )
-            
-            print("\n🚀 Starting batch crawl...")
-            results = await crawler.arun_many(
-                urls=urls,
-                config=crawler_config,
-                dispatcher=dispatcher
-            )
-            print(f"\n✅ Completed {len(results)} URLs successfully")
-            
-    except Exception as e:
-        print(f"\n❌ Error in memory dispatcher demo: {str(e)}")
-
-async def demo_streaming_support():
-    """
-    2. Streaming Support Demo
-    ======================
-    Shows how to process URLs as they complete using streaming
-    """
-    print("\n=== 2. Streaming Support Demo ===")
-
-    browser_config = BrowserConfig(headless=True, verbose=False)
-    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
-
-    # Test URLs
-    urls = ["http://example.com", "http://example.org", "http://example.net"] * 2
-
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        # Initialize dispatcher for streaming
-        dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5)
-
-        print("Starting streaming crawl...")
-        async for result in await crawler.arun_many(
-            urls=urls,
-            config=crawler_config,
-            dispatcher=dispatcher
-        ):
-            # Process each result as it arrives
-            print(
-                f"Received result for {result.url} - Success: {result.success}"
-            )
-            if result.success:
-                print(f"Content length: {len(result.markdown)}")
-
-async def demo_content_scraping():
-    """
-    3. Content Scraping Strategy Demo
-    ==============================
-    Demonstrates the new LXMLWebScrapingStrategy for faster content scraping.
-    """
-    print("\n=== 3. Content Scraping Strategy Demo ===")
-
-    crawler = AsyncWebCrawler()
-    url = "https://example.com/article"
-
-    # Configure with the new LXML strategy
-    config = CrawlerRunConfig(
-        scraping_strategy=LXMLWebScrapingStrategy(), 
-        verbose=True
-    )
-
-    print("Scraping content with LXML strategy...")
-    async with crawler:
-        result = await crawler.arun(url, config=config)
-        if result.success:
-            print("Successfully scraped content using LXML strategy")
-
-async def demo_llm_markdown():
-    """
-    4. LLM-Powered Markdown Generation Demo
-    ===================================
-    Shows how to use the new LLM-powered content filtering and markdown generation.
-    """
-    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")
-
-    crawler = AsyncWebCrawler()
-    url = "https://docs.python.org/3/tutorial/classes.html"
-
-    content_filter = LLMContentFilter(
-        provider="openai/gpt-4o",
-        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="""
-        Focus on extracting the core educational content about Python classes.
-        Include:
-        - Key concepts and their explanations
-        - Important code examples
-        - Essential technical details
-        Exclude:
-        - Navigation elements
-        - Sidebars
-        - Footer content
-        - Version information
-        - Any non-essential UI elements
-        
-        Format the output as clean markdown with proper code blocks and headers.
-        """,
-        verbose=True,
-    )
-
-    # Configure LLM-powered markdown generation
-    config = CrawlerRunConfig(
-        markdown_generator=DefaultMarkdownGenerator(
-            content_filter=content_filter
-        ), 
-        cache_mode = CacheMode.BYPASS,
-        verbose=True
-    )
-
-    print("Generating focused markdown with LLM...")
-    async with crawler:
-        result = await crawler.arun(url, config=config)
-        if result.success and result.markdown_v2:
-            print("Successfully generated LLM-filtered markdown")
-            print("First 500 chars of filtered content:")
-            print(result.markdown_v2.fit_markdown[:500])
-            print("Successfully generated LLM-filtered markdown")
-
-async def demo_robots_compliance():
-    """
-    5. Robots.txt Compliance Demo
-    ==========================
-    Demonstrates the new robots.txt compliance feature with SQLite caching.
-    """
-    print("\n=== 5. Robots.txt Compliance Demo ===")
-
-    crawler = AsyncWebCrawler()
-    urls = ["https://example.com", "https://facebook.com", "https://twitter.com"]
-
-    # Enable robots.txt checking
-    config = CrawlerRunConfig(check_robots_txt=True, verbose=True)
-
-    print("Crawling with robots.txt compliance...")
-    async with crawler:
-        results = await crawler.arun_many(urls, config=config)
-        for result in results:
-            if result.status_code == 403:
-                print(f"Access blocked by robots.txt: {result.url}")
-            elif result.success:
-                print(f"Successfully crawled: {result.url}")
-
-async def demo_json_schema_generation():
-    """
-    7. LLM-Powered Schema Generation Demo
-    =================================
-    Demonstrates automatic CSS and XPath schema generation using LLM models.
-    """
-    print("\n=== 7. LLM-Powered Schema Generation Demo ===")
-
-    # Example HTML content for a job listing
-    html_content = """
-    <div class="job-listing">
-        <h1 class="job-title">Senior Software Engineer</h1>
-        <div class="job-details">
-            <span class="location">San Francisco, CA</span>
-            <span class="salary">$150,000 - $200,000</span>
-            <div class="requirements">
-                <h2>Requirements</h2>
-                <ul>
-                    <li>5+ years Python experience</li>
-                    <li>Strong background in web crawling</li>
-                </ul>
-            </div>
-        </div>
-    </div>
-    """
-
-    print("Generating CSS selectors schema...")
-    # Generate CSS selectors with a specific query
-    css_schema = JsonCssExtractionStrategy.generate_schema(
-        html_content,
-        schema_type="CSS",
-        query="Extract job title, location, and salary information",
-        provider="openai/gpt-4o",  # or use other providers like "ollama"
-    )
-    print("\nGenerated CSS Schema:")
-    print(css_schema)
-
-    # Example of using the generated schema with crawler
-    crawler = AsyncWebCrawler()
-    url = "https://example.com/job-listing"
-
-    # Create an extraction strategy with the generated schema
-    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)
-
-    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)
-
-    print("\nTesting generated schema with crawler...")
-    async with crawler:
-        result = await crawler.arun(url, config=config)
-        if result.success:
-            print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None)
-            print("Successfully used generated schema for crawling")
-
-async def demo_proxy_rotation():
-    """
-    8. Proxy Rotation Demo
-    ===================
-    Demonstrates how to rotate proxies for each request using Crawl4ai.
-    """
-    print("\n=== 8. Proxy Rotation Demo ===")
-
-    async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
-        """Get next proxy from local file"""
-        try:
-            proxies = os.getenv("PROXIES", "").split(",")
-                
-            ip, port, username, password = random.choice(proxies).split(":")
-            return {
-                "server": f"http://{ip}:{port}",
-                "username": username,
-                "password": password,
-                "ip": ip  # Store original IP for verification
-            }
-        except Exception as e:
-            print(f"Error loading proxy: {e}")
-            return None    
-    
-    # Create 10 test requests to httpbin
-    urls = ["https://httpbin.org/ip"] * 2
-    
-    browser_config = BrowserConfig(headless=True, verbose=False)
-    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
-    
-    async with AsyncWebCrawler(config=browser_config) as crawler:
-        for url in urls:
-            proxy = await get_next_proxy()
-            if not proxy:
-                print("No proxy available, skipping...")
-                continue
-                
-            # Create new config with proxy
-            current_config = run_config.clone(proxy_config=proxy, user_agent="")
-            result = await crawler.arun(url=url, config=current_config)
-            
-            if result.success:
-                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
-                print(f"Proxy {proxy['ip']} -> Response IP: {ip_match.group(0) if ip_match else 'Not found'}")
-                verified = ip_match.group(0) == proxy['ip']
-                if verified:
-                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
-                else:
-                    print(f"❌ Proxy failed or IP mismatch!")
-            else:
-                print(f"Failed with proxy {proxy['ip']}")
-
-async def main():
-    """Run all feature demonstrations."""
-    print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n")
-    
-    # Efficiency & Speed Demos
-    print("\n🚀 EFFICIENCY & SPEED DEMOS")
-    await demo_memory_dispatcher()
-    await demo_streaming_support()
-    await demo_content_scraping()
-    
-    # # LLM Integration Demos
-    print("\n🤖 LLM INTEGRATION DEMOS")
-    await demo_json_schema_generation()
-    await demo_llm_markdown()
-    
-    # # Core Improvements
-    print("\n🔧 CORE IMPROVEMENT DEMOS")
-    await demo_robots_compliance()
-    await demo_proxy_rotation()
-
-if __name__ == "__main__":
-    asyncio.run(main())