Merge branch 'next' into feature/scraper

2025-01-21 12:35:45 +05:30
parent 1079965453 2cec527a22
commit 26d78d8512
183 changed files with 17289 additions and 15684 deletions
--- a/tests/20241401/test_async_crawler_strategy.py
+++ b/tests/20241401/test_async_crawler_strategy.py
@@ -0,0 +1,343 @@
+import pytest
+import pytest_asyncio
+import asyncio
+from typing import Dict, Any
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+import os
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+from crawl4ai.models import AsyncCrawlResponse
+from crawl4ai.async_logger import AsyncLogger, LogLevel
+
+CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
+
+if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
+    CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
+
+# Test Config Files
+@pytest.fixture
+def basic_browser_config():
+    return BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        verbose=True
+    )
+
+@pytest.fixture
+def advanced_browser_config():
+    return BrowserConfig(
+        browser_type="chromium", 
+        headless=True,
+        use_managed_browser=True,
+        user_data_dir=CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile"),
+        # proxy="http://localhost:8080",
+        viewport_width=1920,
+        viewport_height=1080,
+        user_agent_mode="random"
+    )
+
+@pytest.fixture
+def basic_crawler_config():
+    return CrawlerRunConfig(
+        word_count_threshold=100,
+        wait_until="domcontentloaded",
+        page_timeout=30000
+    )
+
+@pytest.fixture
+def logger():
+    return AsyncLogger(verbose=True, log_level=LogLevel.DEBUG)
+
+@pytest_asyncio.fixture
+async def crawler_strategy(basic_browser_config, logger):
+    strategy = AsyncPlaywrightCrawlerStrategy(browser_config=basic_browser_config, logger=logger)
+    await strategy.start()
+    yield strategy
+    await strategy.close()
+
+# Browser Configuration Tests
+@pytest.mark.asyncio
+async def test_browser_config_initialization():
+    config = BrowserConfig(
+        browser_type="chromium",
+        user_agent_mode="random"
+    )
+    assert config.browser_type == "chromium"
+    assert config.user_agent is not None
+    assert config.headless is True
+
+@pytest.mark.asyncio 
+async def test_persistent_browser_config():
+    config = BrowserConfig(
+        use_persistent_context=True,
+        user_data_dir="/tmp/test_dir"
+    )
+    assert config.use_managed_browser is True
+    assert config.user_data_dir == "/tmp/test_dir"
+
+# Crawler Strategy Tests
+@pytest.mark.asyncio
+async def test_basic_page_load(crawler_strategy):
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        CrawlerRunConfig()
+    )
+    assert response.status_code == 200
+    assert len(response.html) > 0
+    assert "Example Domain" in response.html
+
+@pytest.mark.asyncio
+async def test_screenshot_capture(crawler_strategy):
+    config = CrawlerRunConfig(screenshot=True)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.screenshot is not None
+    assert len(response.screenshot) > 0
+
+@pytest.mark.asyncio
+async def test_pdf_generation(crawler_strategy):
+    config = CrawlerRunConfig(pdf=True)
+    response = await crawler_strategy.crawl(
+        "https://example.com", 
+        config
+    )
+    assert response.pdf_data is not None
+    assert len(response.pdf_data) > 0
+
+@pytest.mark.asyncio
+async def test_handle_js_execution(crawler_strategy):
+    config = CrawlerRunConfig(
+        js_code="document.body.style.backgroundColor = 'red';"
+    )
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'background-color: red' in response.html.lower()
+
+@pytest.mark.asyncio
+async def test_multiple_js_commands(crawler_strategy):
+    js_commands = [
+        "document.body.style.backgroundColor = 'blue';",
+        "document.title = 'Modified Title';",
+        "const div = document.createElement('div'); div.id = 'test'; div.textContent = 'Test Content'; document.body.appendChild(div);"
+    ]
+    config = CrawlerRunConfig(js_code=js_commands)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'background-color: blue' in response.html.lower()
+    assert 'id="test"' in response.html
+    assert '>Test Content<' in response.html
+    assert '<title>Modified Title</title>' in response.html
+
+@pytest.mark.asyncio
+async def test_complex_dom_manipulation(crawler_strategy):
+    js_code = """
+    // Create a complex structure
+    const container = document.createElement('div');
+    container.className = 'test-container';
+    
+    const list = document.createElement('ul');
+    list.className = 'test-list';
+    
+    for (let i = 1; i <= 3; i++) {
+        const item = document.createElement('li');
+        item.textContent = `Item ${i}`;
+        item.className = `item-${i}`;
+        list.appendChild(item);
+    }
+    
+    container.appendChild(list);
+    document.body.appendChild(container);
+    """
+    config = CrawlerRunConfig(js_code=js_code)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'class="test-container"' in response.html
+    assert 'class="test-list"' in response.html
+    assert 'class="item-1"' in response.html
+    assert '>Item 1<' in response.html
+    assert '>Item 2<' in response.html
+    assert '>Item 3<' in response.html
+
+@pytest.mark.asyncio
+async def test_style_modifications(crawler_strategy):
+    js_code = """
+    const testDiv = document.createElement('div');
+    testDiv.id = 'style-test';
+    testDiv.style.cssText = 'color: green; font-size: 20px; margin: 10px;';
+    testDiv.textContent = 'Styled Content';
+    document.body.appendChild(testDiv);
+    """
+    config = CrawlerRunConfig(js_code=js_code)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'id="style-test"' in response.html
+    assert 'color: green' in response.html.lower()
+    assert 'font-size: 20px' in response.html.lower()
+    assert 'margin: 10px' in response.html.lower()
+    assert '>Styled Content<' in response.html
+
+@pytest.mark.asyncio
+async def test_dynamic_content_loading(crawler_strategy):
+    js_code = """
+    // Simulate dynamic content loading
+    setTimeout(() => {
+        const dynamic = document.createElement('div');
+        dynamic.id = 'dynamic-content';
+        dynamic.textContent = 'Dynamically Loaded';
+        document.body.appendChild(dynamic);
+    }, 1000);
+    
+    // Add a loading indicator immediately
+    const loading = document.createElement('div');
+    loading.id = 'loading';
+    loading.textContent = 'Loading...';
+    document.body.appendChild(loading);
+    """
+    config = CrawlerRunConfig(js_code=js_code, delay_before_return_html=2.0)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'id="loading"' in response.html
+    assert '>Loading...</' in response.html
+    assert 'dynamic-content' in response.html
+    assert '>Dynamically Loaded<' in response.html
+
+# @pytest.mark.asyncio
+# async def test_js_return_values(crawler_strategy):
+#     js_code = """
+#     return {
+#         title: document.title,
+#         metaCount: document.getElementsByTagName('meta').length,
+#         bodyClass: document.body.className
+#     };
+#     """
+#     config = CrawlerRunConfig(js_code=js_code)
+#     response = await crawler_strategy.crawl(
+#         "https://example.com",
+#         config
+#     )
+#     assert response.status_code == 200
+#     assert 'Example Domain' in response.html
+#     assert 'meta name="viewport"' in response.html
+#     assert 'class="main"' in response.html
+
+@pytest.mark.asyncio
+async def test_async_js_execution(crawler_strategy):
+    js_code = """
+    await new Promise(resolve => setTimeout(resolve, 1000));
+    document.body.style.color = 'green';
+    const computedStyle = window.getComputedStyle(document.body);
+    return computedStyle.color;
+    """
+    config = CrawlerRunConfig(js_code=js_code)
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response.status_code == 200
+    assert 'color: green' in response.html.lower()
+
+# @pytest.mark.asyncio
+# async def test_js_error_handling(crawler_strategy):
+#     js_code = """
+#     // Intentionally cause different types of errors
+#     const results = [];
+#     try {
+#         nonExistentFunction();
+#     } catch (e) {
+#         results.push(e.name);
+#     }
+#     try {
+#         JSON.parse('{invalid}');
+#     } catch (e) {
+#         results.push(e.name);
+#     }
+#     return results;
+#     """
+#     config = CrawlerRunConfig(js_code=js_code)
+#     response = await crawler_strategy.crawl(
+#         "https://example.com",
+#         config
+#     )
+#     assert response.status_code == 200
+#     assert 'ReferenceError' in response.html
+#     assert 'SyntaxError' in response.html
+
+@pytest.mark.asyncio
+async def test_handle_navigation_timeout():
+    config = CrawlerRunConfig(page_timeout=1)  # 1ms timeout
+    with pytest.raises(Exception):
+        async with AsyncPlaywrightCrawlerStrategy() as strategy:
+            await strategy.crawl("https://example.com", config)
+
+@pytest.mark.asyncio
+async def test_session_management(crawler_strategy):
+    config = CrawlerRunConfig(session_id="test_session")
+    response1 = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    response2 = await crawler_strategy.crawl(
+        "https://example.com",
+        config
+    )
+    assert response1.status_code == 200
+    assert response2.status_code == 200
+
+@pytest.mark.asyncio
+async def test_process_iframes(crawler_strategy):
+    config = CrawlerRunConfig(
+        process_iframes=True,
+        wait_for_images=True
+    )
+    response = await crawler_strategy.crawl(
+        "https://example.com",
+        config  
+    )
+    assert response.status_code == 200
+
+@pytest.mark.asyncio
+async def test_stealth_mode(crawler_strategy):
+    config = CrawlerRunConfig(
+        simulate_user=True,
+        override_navigator=True
+    )
+    response = await crawler_strategy.crawl(
+        "https://bot.sannysoft.com",
+        config
+    )
+    assert response.status_code == 200
+
+# Error Handling Tests  
+@pytest.mark.asyncio
+async def test_invalid_url():
+    with pytest.raises(ValueError):
+        async with AsyncPlaywrightCrawlerStrategy() as strategy:
+            await strategy.crawl("not_a_url", CrawlerRunConfig())
+
+@pytest.mark.asyncio 
+async def test_network_error_handling():
+    config = CrawlerRunConfig()
+    with pytest.raises(Exception):
+        async with AsyncPlaywrightCrawlerStrategy() as strategy:
+            await strategy.crawl("https://invalid.example.com", config)
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
--- a/tests/20241401/test_async_markdown_generator.py
+++ b/tests/20241401/test_async_markdown_generator.py
@@ -0,0 +1,171 @@
+import asyncio
+from typing import Dict
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+import time
+
+# Test HTML samples
+TEST_HTML_SAMPLES = {
+    "basic": """
+        <body>
+            <h1>Test Title</h1>
+            <p>This is a test paragraph with <a href="http://example.com">a link</a>.</p>
+            <div class="content">
+                <h2>Section 1</h2>
+                <p>More content here with <b>bold text</b>.</p>
+            </div>
+        </body>
+    """,
+    
+    "complex": """
+        <body>
+            <nav>Navigation menu that should be removed</nav>
+            <header>Header content to remove</header>
+            <main>
+                <article>
+                    <h1>Main Article</h1>
+                    <p>Important content paragraph with <a href="http://test.com">useful link</a>.</p>
+                    <section>
+                        <h2>Key Section</h2>
+                        <p>Detailed explanation with multiple sentences. This should be kept 
+                           in the final output. Very important information here.</p>
+                    </section>
+                </article>
+                <aside>Sidebar content to remove</aside>
+            </main>
+            <footer>Footer content to remove</footer>
+        </body>
+    """,
+    
+    "edge_cases": """
+        <body>
+            <div>
+                <p></p>
+                <p>   </p>
+                <script>alert('remove me');</script>
+                <div class="advertisement">Ad content to remove</div>
+                <p class="social-share">Share buttons to remove</p>
+                <h1>!!Special>> Characters## Title!!</h1>
+                <pre><code>def test(): pass</code></pre>
+            </div>
+        </body>
+    """,
+    
+    "links_citations": """
+        <body>
+            <h1>Document with Links</h1>
+            <p>First link to <a href="http://example.com/1">Example 1</a></p>
+            <p>Second link to <a href="http://example.com/2" title="Example 2">Test 2</a></p>
+            <p>Image link: <img src="test.jpg" alt="test image"></p>
+            <p>Repeated link to <a href="http://example.com/1">Example 1 again</a></p>
+        </body>
+    """,
+}
+
+def test_content_filters() -> Dict[str, Dict[str, int]]:
+    """Test various content filtering strategies and return length comparisons."""
+    results = {}
+    
+    # Initialize filters
+    pruning_filter = PruningContentFilter(
+        threshold=0.48,
+        threshold_type="fixed",
+        min_word_threshold=2
+    )
+    
+    bm25_filter = BM25ContentFilter(
+        bm25_threshold=1.0,
+        user_query="test article content important"
+    )
+    
+    # Test each HTML sample
+    for test_name, html in TEST_HTML_SAMPLES.items():
+        # Store results for this test case
+        results[test_name] = {}
+        
+        # Test PruningContentFilter
+        start_time = time.time()
+        pruned_content = pruning_filter.filter_content(html)
+        pruning_time = time.time() - start_time
+        
+        # Test BM25ContentFilter
+        start_time = time.time()
+        bm25_content = bm25_filter.filter_content(html)
+        bm25_time = time.time() - start_time
+        
+        # Store results
+        results[test_name] = {
+            "original_length": len(html),
+            "pruned_length": sum(len(c) for c in pruned_content),
+            "bm25_length": sum(len(c) for c in bm25_content),
+            "pruning_time": pruning_time,
+            "bm25_time": bm25_time
+        }
+        
+    return results
+
+def test_markdown_generation():
+    """Test markdown generation with different configurations."""
+    results = []
+    
+    # Initialize generators with different configurations
+    generators = {
+        "no_filter": DefaultMarkdownGenerator(),
+        "pruning": DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(threshold=0.48)
+        ),
+        "bm25": DefaultMarkdownGenerator(
+            content_filter=BM25ContentFilter(
+                user_query="test article content important"
+            )
+        )
+    }
+    
+    # Test each generator with each HTML sample
+    for test_name, html in TEST_HTML_SAMPLES.items():
+        for gen_name, generator in generators.items():
+            start_time = time.time()
+            result = generator.generate_markdown(
+                html,
+                base_url="http://example.com",
+                citations=True
+            )
+            
+            results.append({
+                "test_case": test_name,
+                "generator": gen_name,
+                "time": time.time() - start_time,
+                "raw_length": len(result.raw_markdown),
+                "fit_length": len(result.fit_markdown) if result.fit_markdown else 0,
+                "citations": len(result.references_markdown)
+            })
+    
+    return results
+
+def main():
+    """Run all tests and print results."""
+    print("Starting content filter tests...")
+    filter_results = test_content_filters()
+    
+    print("\nContent Filter Results:")
+    print("-" * 50)
+    for test_name, metrics in filter_results.items():
+        print(f"\nTest case: {test_name}")
+        print(f"Original length: {metrics['original_length']}")
+        print(f"Pruned length: {metrics['pruned_length']} ({metrics['pruning_time']:.3f}s)")
+        print(f"BM25 length: {metrics['bm25_length']} ({metrics['bm25_time']:.3f}s)")
+        
+    print("\nStarting markdown generation tests...")
+    markdown_results = test_markdown_generation()
+    
+    print("\nMarkdown Generation Results:")
+    print("-" * 50)
+    for result in markdown_results:
+        print(f"\nTest: {result['test_case']} - Generator: {result['generator']}")
+        print(f"Time: {result['time']:.3f}s")
+        print(f"Raw length: {result['raw_length']}")
+        print(f"Fit length: {result['fit_length']}")
+        print(f"Citations: {result['citations']}")
+
+if __name__ == "__main__":
+    main()
--- a/tests/20241401/test_async_webcrawler.py
+++ b/tests/20241401/test_async_webcrawler.py
@@ -0,0 +1,149 @@
+import asyncio
+import pytest
+from typing import List
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig, 
+    CrawlerRunConfig,
+    MemoryAdaptiveDispatcher,
+    RateLimiter,
+    CacheMode
+)
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("viewport", [
+    (800, 600),
+    (1024, 768),
+    (1920, 1080)
+])
+async def test_viewport_config(viewport):
+    """Test different viewport configurations"""
+    width, height = viewport
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        viewport_width=width,
+        viewport_height=height
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=CrawlerRunConfig(
+                # cache_mode=CacheMode.BYPASS,
+                page_timeout=30000  # 30 seconds
+            )
+        )
+        assert result.success
+
+@pytest.mark.asyncio
+async def test_memory_management():
+    """Test memory-adaptive dispatching"""
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        viewport_width=1024,
+        viewport_height=768
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=70.0,
+        check_interval=1.0,
+        max_session_permit=5
+    )
+    
+    urls = ["https://example.com"] * 3  # Test with multiple identical URLs
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        results = await crawler.arun_many(
+            urls=urls,
+            config=CrawlerRunConfig(page_timeout=30000),
+            dispatcher=dispatcher
+        )
+        assert len(results) == len(urls)
+
+@pytest.mark.asyncio
+async def test_rate_limiting():
+    """Test rate limiting functionality"""
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=RateLimiter(
+            base_delay=(1.0, 2.0),
+            max_delay=5.0,
+            max_retries=2
+        ),
+        memory_threshold_percent=70.0
+    )
+    
+    urls = [
+        "https://example.com",
+        "https://example.org",
+        "https://example.net"
+    ]
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        results = await crawler.arun_many(
+            urls=urls,
+            config=CrawlerRunConfig(page_timeout=30000),
+            dispatcher=dispatcher
+        )
+        assert len(results) == len(urls)
+
+@pytest.mark.asyncio
+async def test_javascript_execution():
+    """Test JavaScript execution capabilities"""
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True,
+        java_script_enabled=True
+    )
+    
+    js_code = """
+        document.body.style.backgroundColor = 'red';
+        return document.body.style.backgroundColor;
+    """
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=CrawlerRunConfig(
+                js_code=js_code,
+                page_timeout=30000
+            )
+        )
+        assert result.success
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("error_url", [
+    "https://invalid.domain.test",
+    "https://httpbin.org/status/404",
+    "https://httpbin.org/status/503",
+    "https://httpbin.org/status/403"
+])
+async def test_error_handling(error_url):
+    """Test error handling for various failure scenarios"""
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url=error_url,
+            config=CrawlerRunConfig(
+                page_timeout=10000,  # Short timeout for error cases
+                cache_mode=CacheMode.BYPASS
+            )
+        )
+        assert not result.success
+        assert result.error_message is not None
+
+if __name__ == "__main__":
+    asyncio.run(test_viewport_config((1024, 768)))
+    asyncio.run(test_memory_management())
+    asyncio.run(test_rate_limiting())
+    asyncio.run(test_javascript_execution())
--- a/tests/20241401/test_cache_context.py
+++ b/tests/20241401/test_cache_context.py
@@ -0,0 +1,85 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
+
+async def test_reuse_context_by_config():
+    # We will store each context ID in these lists to confirm reuse
+    context_ids_for_A = []
+    context_ids_for_B = []
+
+    # Create a small hook to track context creation
+    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
+        c_id = id(context)
+        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
+        # Distinguish which config we used via its shared_data "config_label" entry
+        config_label = config.shared_data.get("config_label", "unknown")
+        if config_label == "A":
+            context_ids_for_A.append(c_id)
+        elif config_label == "B":
+            context_ids_for_B.append(c_id)
+        return page
+
+    # Browser config - Headless, verbose so we see logs
+    browser_config = BrowserConfig(headless=True, verbose=True)
+
+    # Two crawler run configs that differ (here, in only_text):
+    configA = CrawlerRunConfig(
+        only_text=True,
+        cache_mode=CacheMode.BYPASS,
+        wait_until="domcontentloaded",
+        shared_data = {
+            "config_label" : "A"
+        }
+    )
+    configB = CrawlerRunConfig(
+        only_text=False,
+        cache_mode=CacheMode.BYPASS,
+        wait_until="domcontentloaded",
+        shared_data = {
+            "config_label" : "B"
+        }
+    )
+
+    # Create the crawler
+    crawler = AsyncWebCrawler(config=browser_config)
+
+    # Attach our custom hook
+    # Note: "on_page_context_created" will be called each time a new context+page is generated
+    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+
+    # Start the crawler (launches the browser)
+    await crawler.start()
+
+    # For demonstration, we’ll crawl a benign site multiple times with each config
+    test_url = "https://example.com"
+    print("\n--- Crawling with config A (text_mode=True) ---")
+    for _ in range(2):
+        # The hook tells the configs apart via config.shared_data["config_label"]
+        await crawler.arun(test_url, config=configA)
+
+    print("\n--- Crawling with config B (text_mode=False) ---")
+    for _ in range(2):
+        await crawler.arun(test_url, config=configB)
+
+    # Close the crawler (shuts down the browser, closes contexts)
+    await crawler.close()
+
+    # Validate and show the results
+    print("\n=== RESULTS ===")
+    print(f"Config A context IDs: {context_ids_for_A}")
+    print(f"Config B context IDs: {context_ids_for_B}")
+    if len(set(context_ids_for_A)) == 1:
+        print("✅ All config A crawls used the SAME BrowserContext.")
+    else:
+        print("❌ Config A crawls created multiple contexts unexpectedly.")
+    if len(set(context_ids_for_B)) == 1:
+        print("✅ All config B crawls used the SAME BrowserContext.")
+    else:
+        print("❌ Config B crawls created multiple contexts unexpectedly.")
+    if set(context_ids_for_A).isdisjoint(context_ids_for_B):
+        print("✅ Config A context is different from Config B context.")
+    else:
+        print("❌ A and B ended up sharing the same context somehow!")
+
+if __name__ == "__main__":
+    asyncio.run(test_reuse_context_by_config())
--- a/tests/20241401/test_llm_filter.py
+++ b/tests/20241401/test_llm_filter.py
@@ -0,0 +1,87 @@
+import os
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.content_filter_strategy import LLMContentFilter
+
+async def test_llm_filter():
+    # URL of a page whose HTML needs intelligent filtering
+    url = "https://docs.python.org/3/tutorial/classes.html"
+    
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=True
+    )
+    
+    # run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+    run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # First crawl the page and take its cleaned HTML
+        result = await crawler.arun(url, config=run_config)
+        html = result.cleaned_html
+
+        # Initialize LLM filter with focused instruction
+        filter = LLMContentFilter(
+            provider="openai/gpt-4o",
+            api_token=os.getenv('OPENAI_API_KEY'),
+            instruction="""
+            Focus on extracting the core educational content about Python classes.
+            Include:
+            - Key concepts and their explanations
+            - Important code examples
+            - Essential technical details
+            Exclude:
+            - Navigation elements
+            - Sidebars
+            - Footer content
+            - Version information
+            - Any non-essential UI elements
+            
+            Format the output as clean markdown with proper code blocks and headers.
+            """,
+            verbose=True
+        )
+        
+        filter = LLMContentFilter(
+            provider="openai/gpt-4o",
+            api_token=os.getenv('OPENAI_API_KEY'),
+            chunk_token_threshold=2 ** 12 * 2, # 4096 * 2 = 8192
+            instruction="""
+            Extract the main educational content while preserving its original wording and substance completely. Your task is to:
+
+            1. Maintain the exact language and terminology used in the main content
+            2. Keep all technical explanations, examples, and educational content intact
+            3. Preserve the original flow and structure of the core content
+            4. Remove only clearly irrelevant elements like:
+            - Navigation menus
+            - Advertisement sections
+            - Cookie notices
+            - Footers with site information
+            - Sidebars with external links
+            - Any UI elements that don't contribute to learning
+
+            The goal is to create a clean markdown version that reads exactly like the original article, 
+            keeping all valuable content but free from distracting elements. Imagine you're creating 
+            a perfect reading experience where nothing valuable is lost, but all noise is removed.
+            """,
+            verbose=True
+        )        
+
+        # Apply filtering
+        filtered_content = filter.filter_content(html, ignore_cache = True)
+        
+        # Show results
+        print("\nFiltered Content Length:", len(filtered_content))
+        print("\nFirst 500 chars of filtered content:")
+        if filtered_content:
+            print(filtered_content[0][:500])
+        
+        # Save the markdown version to disk
+        with open("filtered_content.md", "w", encoding="utf-8") as f:
+            f.write("\n".join(filtered_content))
+        
+        # Show token usage
+        filter.show_usage()
+
+if __name__ == "__main__":
+    asyncio.run(test_llm_filter())
--- a/tests/20241401/test_schema_builder.py
+++ b/tests/20241401/test_schema_builder.py
@@ -0,0 +1,111 @@
+# https://claude.ai/chat/c4bbe93d-fb54-44ce-92af-76b4c8086c6b
+# https://claude.ai/chat/c24a768c-d8b2-478a-acc7-d76d42a308da
+import os, sys
+
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+import json
+
+# Test HTML - A complex job board with companies, departments, and positions
+test_html = """
+<div class="company-listings">
+    <div class="company" data-company-id="123">
+        <div class="company-header">
+            <img class="company-logo" src="google.png" alt="Google">
+            <h1 class="company-name">Google</h1>
+            <div class="company-meta">
+                <span class="company-size">10,000+ employees</span>
+                <span class="company-industry">Technology</span>
+                <a href="https://google.careers" class="careers-link">Careers Page</a>
+            </div>
+        </div>
+        
+        <div class="departments">
+            <div class="department">
+                <h2 class="department-name">Engineering</h2>
+                <div class="positions">
+                    <div class="position-card" data-position-id="eng-1">
+                        <h3 class="position-title">Senior Software Engineer</h3>
+                        <span class="salary-range">$150,000 - $250,000</span>
+                        <div class="position-meta">
+                            <span class="location">Mountain View, CA</span>
+                            <span class="job-type">Full-time</span>
+                            <span class="experience">5+ years</span>
+                        </div>
+                        <div class="skills-required">
+                            <span class="skill">Python</span>
+                            <span class="skill">Kubernetes</span>
+                            <span class="skill">Machine Learning</span>
+                        </div>
+                        <p class="position-description">Join our core engineering team...</p>
+                        <div class="application-info">
+                            <span class="posting-date">Posted: 2024-03-15</span>
+                            <button class="apply-btn" data-req-id="REQ12345">Apply Now</button>
+                        </div>
+                    </div>
+                    <!-- More positions -->
+                </div>
+            </div>
+            
+            <div class="department">
+                <h2 class="department-name">Marketing</h2>
+                <div class="positions">
+                    <div class="position-card" data-position-id="mkt-1">
+                        <h3 class="position-title">Growth Marketing Manager</h3>
+                        <span class="salary-range">$120,000 - $180,000</span>
+                        <div class="position-meta">
+                            <span class="location">New York, NY</span>
+                            <span class="job-type">Full-time</span>
+                            <span class="experience">3+ years</span>
+                        </div>
+                        <div class="skills-required">
+                            <span class="skill">SEO</span>
+                            <span class="skill">Analytics</span>
+                            <span class="skill">Content Strategy</span>
+                        </div>
+                        <p class="position-description">Drive our growth initiatives...</p>
+                        <div class="application-info">
+                            <span class="posting-date">Posted: 2024-03-14</span>
+                            <button class="apply-btn" data-req-id="REQ12346">Apply Now</button>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+    </div>
+</div>
+"""
+
+# Test cases
+def test_schema_generation():
+    # Test 1: No query (should extract everything)
+    print("\nTest 1: No Query (Full Schema)")
+    schema1 = JsonCssExtractionStrategy.generate_schema(test_html)
+    print(json.dumps(schema1, indent=2))
+    
+    # Test 2: Query for just basic job info
+    print("\nTest 2: Basic Job Info Query")
+    query2 = "I only need job titles, salaries, and locations"
+    schema2 = JsonCssExtractionStrategy.generate_schema(test_html, query2)
+    print(json.dumps(schema2, indent=2))
+    
+    # Test 3: Query for company and department structure
+    print("\nTest 3: Organizational Structure Query")
+    query3 = "Extract company details and department names, without position details"
+    schema3 = JsonCssExtractionStrategy.generate_schema(test_html, query3)
+    print(json.dumps(schema3, indent=2))
+    
+    # Test 4: Query for specific skills tracking
+    print("\nTest 4: Skills Analysis Query")
+    query4 = "I want to analyze required skills across all positions"
+    schema4 = JsonCssExtractionStrategy.generate_schema(test_html, query4)
+    print(json.dumps(schema4, indent=2))
+
+if __name__ == "__main__":
+    test_schema_generation()
--- a/tests/20241401/test_stream.py
+++ b/tests/20241401/test_stream.py
@@ -0,0 +1,50 @@
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+
+import asyncio
+from crawl4ai import *
+
+async def test_crawler():  # manual smoke test: prints per-URL outcomes, asserts nothing
+    # Setup configurations: headless browser, cache bypassed, pruning filter on the markdown
+    browser_config = BrowserConfig(headless=True, verbose=False)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(
+                threshold=0.48, 
+                threshold_type="fixed", 
+                min_word_threshold=0
+            )
+        ),
+    )
+
+    # Test URLs - three example domains, each repeated 10 times
+    urls = [
+        "http://example.com",
+        "http://example.org",
+        "http://example.net",
+    ] * 10  # 3 URLs x 10 = 30 total URLs
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        print("\n=== Testing Streaming Mode ===")
+        async for result in await crawler.arun_many(  # awaited call is consumed with async for
+            urls=urls,
+            config=crawler_config.clone(stream=True),  # same settings, with stream=True
+        ):
+            print(f"Received result for: {result.url} - Success: {result.success}")
+            
+        print("\n=== Testing Batch Mode ===")
+        results = await crawler.arun_many(  # no stream flag: the result is used with len() below
+            urls=urls,
+            config=crawler_config,
+        )
+        print(f"Received all {len(results)} results at once")
+        for result in results:
+            print(f"Batch result for: {result.url} - Success: {result.success}")
+
+if __name__ == "__main__":
+    asyncio.run(test_crawler())
--- a/tests/20241401/test_stream_dispatch.py
+++ b/tests/20241401/test_stream_dispatch.py
@@ -0,0 +1,39 @@
+import os, sys
+# append 2 parent directories to sys.path to import crawl4ai
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+
+
+import asyncio
+from typing import List
+from crawl4ai import *
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
+
+async def test_streaming():  # manual smoke test: prints streamed outcomes, asserts nothing
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        markdown_generator=DefaultMarkdownGenerator(
+            # content_filter=PruningContentFilter(
+            #     threshold=0.48, 
+            #     threshold_type="fixed", 
+            #     min_word_threshold=0
+            # )
+        ),
+    )
+
+    urls = ["http://example.com"] * 10  # the same URL requested 10 times
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        dispatcher = MemoryAdaptiveDispatcher(
+            max_session_permit=5,
+            check_interval=0.5
+        )
+        # Each streamed item exposes .url directly and the crawl outcome under .result
+        async for result in dispatcher.run_urls_stream(urls, crawler, crawler_config):
+            print(f"Got result for {result.url} - Success: {result.result.success}")
+
+if __name__ == "__main__":
+    asyncio.run(test_streaming())
--- a/tests/async/test_0.4.2_browser_manager.py
+++ b/tests/async/test_0.4.2_browser_manager.py
@@ -1,17 +1,18 @@
-import os, sys
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-sys.path.append(parent_dir)
-__location__ = os.path.realpath(    os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-import os, sys
+import os
+import sys
 import asyncio
 from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

-# Assuming that the changes made allow different configurations 
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+
+# Assuming that the changes made allow different configurations
 # for managed browser, persistent context, and so forth.

+
 async def test_default_headless():
    async with AsyncWebCrawler(
        headless=True,
@@ -24,13 +25,14 @@ async def test_default_headless():
        # Testing normal ephemeral context
    ) as crawler:
        result = await crawler.arun(
-            url='https://www.kidocode.com/degrees/technology',
+            url="https://www.kidocode.com/degrees/technology",
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_default_headless] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))
-        
+
+
 async def test_managed_browser_persistent():
    # Treating use_persistent_context=True as managed_browser scenario.
    async with AsyncWebCrawler(
@@ -44,13 +46,14 @@ async def test_managed_browser_persistent():
        # This should store and reuse profile data across runs
    ) as crawler:
        result = await crawler.arun(
-            url='https://www.google.com',
+            url="https://www.google.com",
            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_managed_browser_persistent] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))

+
 async def test_session_reuse():
    # Test creating a session, using it for multiple calls
    session_id = "my_session"
@@ -62,25 +65,25 @@ async def test_session_reuse():
        use_managed_browser=False,
        use_persistent_context=False,
    ) as crawler:
-        
        # First call: create session
        result1 = await crawler.arun(
-            url='https://www.example.com',
+            url="https://www.example.com",
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse first call] success:", result1.success)
-        
+
        # Second call: same session, possibly cookie retained
        result2 = await crawler.arun(
-            url='https://www.example.com/about',
+            url="https://www.example.com/about",
            cache_mode=CacheMode.BYPASS,
            session_id=session_id,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_session_reuse second call] success:", result2.success)

+
 async def test_magic_mode():
    # Test magic mode with override_navigator and simulate_user
    async with AsyncWebCrawler(
@@ -95,13 +98,14 @@ async def test_magic_mode():
        simulate_user=True,
    ) as crawler:
        result = await crawler.arun(
-            url='https://www.kidocode.com/degrees/business',
+            url="https://www.kidocode.com/degrees/business",
            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_magic_mode] success:", result.success)
        print("HTML length:", len(result.html if result.html else ""))

+
 async def test_proxy_settings():
    # Test with a proxy (if available) to ensure code runs with proxy
    async with AsyncWebCrawler(
@@ -113,14 +117,15 @@ async def test_proxy_settings():
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
-            url='https://httpbin.org/ip',
+            url="https://httpbin.org/ip",
            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_proxy_settings] success:", result.success)
        if result.success:
            print("HTML preview:", result.html[:200] if result.html else "")

+
 async def test_ignore_https_errors():
    # Test ignore HTTPS errors with a self-signed or invalid cert domain
    # This is just conceptual, the domain should be one that triggers SSL error.
@@ -134,12 +139,13 @@ async def test_ignore_https_errors():
        use_persistent_context=False,
    ) as crawler:
        result = await crawler.arun(
-            url='https://self-signed.badssl.com/',
+            url="https://self-signed.badssl.com/",
            cache_mode=CacheMode.BYPASS,
-            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
        )
        print("[test_ignore_https_errors] success:", result.success)

+
 async def main():
    print("Running tests...")
    # await test_default_headless()
@@ -149,5 +155,6 @@ async def main():
    # await test_proxy_settings()
    await test_ignore_https_errors()

+
 if __name__ == "__main__":
    asyncio.run(main())
--- a/tests/async/test_0.4.2_config_params.py
+++ b/tests/async/test_0.4.2_config_params.py
@@ -1,15 +1,16 @@
 import os, sys
+
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

 import asyncio
 from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig      
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.chunking_strategy import RegexChunking
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+

 # Category 1: Browser Configuration Tests
 async def test_browser_config_object():
@@ -21,29 +22,31 @@ async def test_browser_config_object():
        viewport_height=1080,
        use_managed_browser=True,
        user_agent_mode="random",
-        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
+        user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
    )
-    
+
    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
-        result = await crawler.arun('https://example.com', cache_mode=CacheMode.BYPASS)
+        result = await crawler.arun("https://example.com", cache_mode=CacheMode.BYPASS)
        assert result.success, "Browser config crawl failed"
        assert len(result.html) > 0, "No HTML content retrieved"

+
 async def test_browser_performance_config():
    """Test browser configurations focused on performance"""
    browser_config = BrowserConfig(
        text_mode=True,
        light_mode=True,
-        extra_args=['--disable-gpu', '--disable-software-rasterizer'],
+        extra_args=["--disable-gpu", "--disable-software-rasterizer"],
        ignore_https_errors=True,
-        java_script_enabled=False
+        java_script_enabled=False,
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun('https://example.com')
+        result = await crawler.arun("https://example.com")
        assert result.success, "Performance optimized crawl failed"
        assert result.status_code == 200, "Unexpected status code"

+
 # Category 2: Content Processing Tests
 async def test_content_extraction_config():
    """Test content extraction with various strategies"""
@@ -53,24 +56,20 @@ async def test_content_extraction_config():
            schema={
                "name": "article",
                "baseSelector": "div",
-                "fields": [{
-                    "name": "title",
-                    "selector": "h1",
-                    "type": "text"
-                }]
+                "fields": [{"name": "title", "selector": "h1", "type": "text"}],
            }
        ),
        chunking_strategy=RegexChunking(),
-        content_filter=PruningContentFilter()
+        content_filter=PruningContentFilter(),
    )
-    
+
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
-            'https://example.com/article',
-            config=crawler_config
+            "https://example.com/article", config=crawler_config
        )
        assert result.extracted_content is not None, "Content extraction failed"
-        assert 'title' in result.extracted_content, "Missing expected content field"
+        assert "title" in result.extracted_content, "Missing expected content field"
+

 # Category 3: Cache and Session Management Tests
 async def test_cache_and_session_management():
@@ -79,25 +78,20 @@ async def test_cache_and_session_management():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.WRITE_ONLY,
        process_iframes=True,
-        remove_overlay_elements=True
+        remove_overlay_elements=True,
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # First request - should write to cache
-        result1 = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
-        
+        result1 = await crawler.arun("https://example.com", config=crawler_config)
+
        # Second request - should use fresh fetch due to WRITE_ONLY mode
-        result2 = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
-        
+        result2 = await crawler.arun("https://example.com", config=crawler_config)
+
        assert result1.success and result2.success, "Cache mode crawl failed"
        assert result1.html == result2.html, "Inconsistent results between requests"

+
 # Category 4: Media Handling Tests
 async def test_media_handling_config():
    """Test configurations related to media handling"""
@@ -107,24 +101,22 @@ async def test_media_handling_config():
        viewport_width=1920,
        viewport_height=1080,
        accept_downloads=True,
-        downloads_path= os.path.expanduser("~/.crawl4ai/downloads")
+        downloads_path=os.path.expanduser("~/.crawl4ai/downloads"),
    )
    crawler_config = CrawlerRunConfig(
        screenshot=True,
        pdf=True,
        adjust_viewport_to_content=True,
        wait_for_images=True,
-        screenshot_height_threshold=20000
+        screenshot_height_threshold=20000,
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
+        result = await crawler.arun("https://example.com", config=crawler_config)
        assert result.screenshot is not None, "Screenshot capture failed"
        assert result.pdf is not None, "PDF generation failed"

+
 # Category 5: Anti-Bot and Site Interaction Tests
 async def test_antibot_config():
    """Test configurations for handling anti-bot measures"""
@@ -135,76 +127,64 @@ async def test_antibot_config():
        wait_for="js:()=>document.querySelector('body')",
        delay_before_return_html=1.0,
        log_console=True,
-        cache_mode=CacheMode.BYPASS
+        cache_mode=CacheMode.BYPASS,
    )
-    
+
    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            'https://example.com',
-            config=crawler_config
-        )
+        result = await crawler.arun("https://example.com", config=crawler_config)
        assert result.success, "Anti-bot measure handling failed"

+
 # Category 6: Parallel Processing Tests
 async def test_parallel_processing():
    """Test parallel processing capabilities"""
-    crawler_config = CrawlerRunConfig(
-        mean_delay=0.5,
-        max_range=1.0,
-        semaphore_count=5
-    )
-    
-    urls = [
-        'https://example.com/1',
-        'https://example.com/2',
-        'https://example.com/3'
-    ]
-    
+    crawler_config = CrawlerRunConfig(mean_delay=0.5, max_range=1.0, semaphore_count=5)
+
+    urls = ["https://example.com/1", "https://example.com/2", "https://example.com/3"]
+
    async with AsyncWebCrawler() as crawler:
-        results = await crawler.arun_many(
-            urls,
-            config=crawler_config
-        )
+        results = await crawler.arun_many(urls, config=crawler_config)
        assert len(results) == len(urls), "Not all URLs were processed"
        assert all(r.success for r in results), "Some parallel requests failed"

+
 # Category 7: Backwards Compatibility Tests
 async def test_legacy_parameter_support():
    """Test that legacy parameters still work"""
    async with AsyncWebCrawler(
-        headless=True,
-        browser_type="chromium",
-        viewport_width=1024,
-        viewport_height=768
+        headless=True, browser_type="chromium", viewport_width=1024, viewport_height=768
    ) as crawler:
        result = await crawler.arun(
-            'https://example.com',
+            "https://example.com",
            screenshot=True,
            word_count_threshold=200,
            bypass_cache=True,
-            css_selector=".main-content"
+            css_selector=".main-content",
        )
        assert result.success, "Legacy parameter support failed"

+
 # Category 8: Mixed Configuration Tests
 async def test_mixed_config_usage():
    """Test mixing new config objects with legacy parameters"""
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(screenshot=True)
-    
+
    async with AsyncWebCrawler(
        config=browser_config,
-        verbose=True  # legacy parameter
+        verbose=True,  # legacy parameter
    ) as crawler:
        result = await crawler.arun(
-            'https://example.com',
+            "https://example.com",
            config=crawler_config,
            cache_mode=CacheMode.BYPASS,  # legacy parameter
-            css_selector="body"  # legacy parameter
+            css_selector="body",  # legacy parameter
        )
        assert result.success, "Mixed configuration usage failed"

+
 if __name__ == "__main__":
+
    async def run_tests():
        test_functions = [
            test_browser_config_object,
@@ -217,7 +197,7 @@ if __name__ == "__main__":
            # test_legacy_parameter_support,
            # test_mixed_config_usage
        ]
-        
+
        for test in test_functions:
            print(f"\nRunning {test.__name__}...")
            try:
@@ -227,5 +207,5 @@ if __name__ == "__main__":
                print(f"✗ {test.__name__} failed: {str(e)}")
            except Exception as e:
                print(f"✗ {test.__name__} error: {str(e)}")
-    
-    asyncio.run(run_tests())
+
+    asyncio.run(run_tests())
--- a/tests/async/test_async_doanloader.py
+++ b/tests/async/test_async_doanloader.py
@@ -4,7 +4,6 @@ import asyncio
 import shutil
 from typing import List
 import tempfile
-import time

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -12,28 +11,27 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
 class TestDownloads:
    def __init__(self):
        self.temp_dir = tempfile.mkdtemp(prefix="crawl4ai_test_")
        self.download_dir = os.path.join(self.temp_dir, "downloads")
        os.makedirs(self.download_dir, exist_ok=True)
        self.results: List[str] = []
-        
+
    def cleanup(self):
        shutil.rmtree(self.temp_dir)
-        
+
    def log_result(self, test_name: str, success: bool, message: str = ""):
        result = f"{'✅' if success else '❌'} {test_name}: {message}"
        self.results.append(result)
        print(result)
-        
+
    async def test_basic_download(self):
        """Test basic file download functionality"""
        try:
            async with AsyncWebCrawler(
-                accept_downloads=True,
-                downloads_path=self.download_dir,
-                verbose=True
+                accept_downloads=True, downloads_path=self.download_dir, verbose=True
            ) as crawler:
                # Python.org downloads page typically has stable download links
                result = await crawler.arun(
@@ -42,14 +40,19 @@ class TestDownloads:
                    // Click first download link
                    const downloadLink = document.querySelector('a[href$=".exe"]');
                    if (downloadLink) downloadLink.click();
-                    """
+                    """,
+                )
+
+                success = (
+                    result.downloaded_files is not None
+                    and len(result.downloaded_files) > 0
                )
-                
-                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Basic Download",
                    success,
-                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+                    f"Downloaded {len(result.downloaded_files or [])} files"
+                    if success
+                    else "No files downloaded",
                )
        except Exception as e:
            self.log_result("Basic Download", False, str(e))
@@ -59,27 +62,32 @@ class TestDownloads:
        try:
            user_data_dir = os.path.join(self.temp_dir, "user_data")
            os.makedirs(user_data_dir, exist_ok=True)
-            
+
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path=self.download_dir,
                use_persistent_context=True,
                user_data_dir=user_data_dir,
-                verbose=True
+                verbose=True,
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
                    js_code="""
                    const downloadLink = document.querySelector('a[href$=".exe"]');
                    if (downloadLink) downloadLink.click();
-                    """
+                    """,
+                )
+
+                success = (
+                    result.downloaded_files is not None
+                    and len(result.downloaded_files) > 0
                )
-                
-                success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                self.log_result(
                    "Persistent Context Download",
                    success,
-                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+                    f"Downloaded {len(result.downloaded_files or [])} files"
+                    if success
+                    else "No files downloaded",
                )
        except Exception as e:
            self.log_result("Persistent Context Download", False, str(e))
@@ -88,9 +96,7 @@ class TestDownloads:
        """Test multiple simultaneous downloads"""
        try:
            async with AsyncWebCrawler(
-                accept_downloads=True,
-                downloads_path=self.download_dir,
-                verbose=True
+                accept_downloads=True, downloads_path=self.download_dir, verbose=True
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
@@ -98,14 +104,19 @@ class TestDownloads:
                    // Click multiple download links
                    const downloadLinks = document.querySelectorAll('a[href$=".exe"]');
                    downloadLinks.forEach(link => link.click());
-                    """
+                    """,
+                )
+
+                success = (
+                    result.downloaded_files is not None
+                    and len(result.downloaded_files) > 1
                )
-                
-                success = result.downloaded_files is not None and len(result.downloaded_files) > 1
                self.log_result(
                    "Multiple Downloads",
                    success,
-                    f"Downloaded {len(result.downloaded_files or [])} files" if success else "Not enough files downloaded"
+                    f"Downloaded {len(result.downloaded_files or [])} files"
+                    if success
+                    else "Not enough files downloaded",
                )
        except Exception as e:
            self.log_result("Multiple Downloads", False, str(e))
@@ -113,49 +124,51 @@ class TestDownloads:
    async def test_different_browsers(self):
        """Test downloads across different browser types"""
        browsers = ["chromium", "firefox", "webkit"]
-        
+
        for browser_type in browsers:
            try:
                async with AsyncWebCrawler(
                    accept_downloads=True,
                    downloads_path=self.download_dir,
                    browser_type=browser_type,
-                    verbose=True
+                    verbose=True,
                ) as crawler:
                    result = await crawler.arun(
                        url="https://www.python.org/downloads/",
                        js_code="""
                        const downloadLink = document.querySelector('a[href$=".exe"]');
                        if (downloadLink) downloadLink.click();
-                        """
+                        """,
+                    )
+
+                    success = (
+                        result.downloaded_files is not None
+                        and len(result.downloaded_files) > 0
                    )
-                    
-                    success = result.downloaded_files is not None and len(result.downloaded_files) > 0
                    self.log_result(
                        f"{browser_type.title()} Download",
                        success,
-                        f"Downloaded {len(result.downloaded_files or [])} files" if success else "No files downloaded"
+                        f"Downloaded {len(result.downloaded_files or [])} files"
+                        if success
+                        else "No files downloaded",
                    )
            except Exception as e:
                self.log_result(f"{browser_type.title()} Download", False, str(e))

    async def test_edge_cases(self):
        """Test various edge cases"""
-        
+
        # Test 1: Downloads without specifying download path
        try:
-            async with AsyncWebCrawler(
-                accept_downloads=True,
-                verbose=True
-            ) as crawler:
+            async with AsyncWebCrawler(accept_downloads=True, verbose=True) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
-                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
+                    js_code="document.querySelector('a[href$=\".exe\"]').click()",
                )
                self.log_result(
                    "Default Download Path",
                    True,
-                    f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}"
+                    f"Downloaded to default path: {result.downloaded_files[0] if result.downloaded_files else 'None'}",
                )
        except Exception as e:
            self.log_result("Default Download Path", False, str(e))
@@ -165,31 +178,34 @@ class TestDownloads:
            async with AsyncWebCrawler(
                accept_downloads=True,
                downloads_path="/invalid/path/that/doesnt/exist",
-                verbose=True
+                verbose=True,
            ) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
-                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
+                    js_code="document.querySelector('a[href$=\".exe\"]').click()",
                )
-                self.log_result("Invalid Download Path", False, "Should have raised an error")
-        except Exception as e:
-            self.log_result("Invalid Download Path", True, "Correctly handled invalid path")
+                self.log_result(
+                    "Invalid Download Path", False, "Should have raised an error"
+                )
+        except Exception:
+            self.log_result(
+                "Invalid Download Path", True, "Correctly handled invalid path"
+            )

        # Test 3: Download with accept_downloads=False
        try:
-            async with AsyncWebCrawler(
-                accept_downloads=False,
-                verbose=True
-            ) as crawler:
+            async with AsyncWebCrawler(accept_downloads=False, verbose=True) as crawler:
                result = await crawler.arun(
                    url="https://www.python.org/downloads/",
-                    js_code="document.querySelector('a[href$=\".exe\"]').click()"
+                    js_code="document.querySelector('a[href$=\".exe\"]').click()",
                )
                success = result.downloaded_files is None
                self.log_result(
                    "Disabled Downloads",
                    success,
-                    "Correctly ignored downloads" if success else "Unexpectedly downloaded files"
+                    "Correctly ignored downloads"
+                    if success
+                    else "Unexpectedly downloaded files",
                )
        except Exception as e:
            self.log_result("Disabled Downloads", False, str(e))
@@ -197,33 +213,35 @@ class TestDownloads:
    async def run_all_tests(self):
        """Run all test cases"""
        print("\n🧪 Running Download Tests...\n")
-        
+
        test_methods = [
            self.test_basic_download,
            self.test_persistent_context_download,
            self.test_multiple_downloads,
            self.test_different_browsers,
-            self.test_edge_cases
+            self.test_edge_cases,
        ]
-        
+
        for test in test_methods:
            print(f"\n📝 Running {test.__doc__}...")
            await test()
            await asyncio.sleep(2)  # Brief pause between tests
-            
+
        print("\n📊 Test Results Summary:")
        for result in self.results:
            print(result)
-            
-        successes = len([r for r in self.results if '✅' in r])
+
+        successes = len([r for r in self.results if "✅" in r])
        total = len(self.results)
        print(f"\nTotal: {successes}/{total} tests passed")
-        
+
        self.cleanup()

+
 async def main():
    tester = TestDownloads()
    await tester.run_all_tests()

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/tests/async/test_basic_crawling.py
+++ b/tests/async/test_basic_crawling.py
@@ -1,15 +1,17 @@
 import os
 import sys
 import pytest
-import asyncio
 import time

 # Add the parent directory to the Python path
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_successful_crawl():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -21,6 +23,7 @@ async def test_successful_crawl():
        assert result.markdown
        assert result.cleaned_html

+
@pytest.mark.asyncio
 async def test_invalid_url():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -29,19 +32,21 @@ async def test_invalid_url():
        assert not result.success
        assert result.error_message

+
@pytest.mark.asyncio
 async def test_multiple_urls():
    async with AsyncWebCrawler(verbose=True) as crawler:
        urls = [
            "https://www.nbcnews.com/business",
            "https://www.example.com",
-            "https://www.python.org"
+            "https://www.python.org",
        ]
        results = await crawler.arun_many(urls=urls, bypass_cache=True)
        assert len(results) == len(urls)
        assert all(result.success for result in results)
        assert all(result.html for result in results)

+
@pytest.mark.asyncio
 async def test_javascript_execution():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -51,6 +56,7 @@ async def test_javascript_execution():
        assert result.success
        assert "<h1>Modified by JS</h1>" in result.html

+
@pytest.mark.asyncio
 async def test_concurrent_crawling_performance():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -59,23 +65,26 @@ async def test_concurrent_crawling_performance():
            "https://www.example.com",
            "https://www.python.org",
            "https://www.github.com",
-            "https://www.stackoverflow.com"
+            "https://www.stackoverflow.com",
        ]
-        
+
        start_time = time.time()
        results = await crawler.arun_many(urls=urls, bypass_cache=True)
        end_time = time.time()
-        
+
        total_time = end_time - start_time
        print(f"Total time for concurrent crawling: {total_time:.2f} seconds")
-        
+
        assert all(result.success for result in results)
        assert len(results) == len(urls)
-        
+
        # Assert that concurrent crawling is faster than sequential
        # This multiplier may need adjustment based on the number of URLs and their complexity
-        assert total_time < len(urls) * 5, f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
+        assert (
+            total_time < len(urls) * 5
+        ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
+

 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_caching.py
+++ b/tests/async/test_caching.py
@@ -9,74 +9,79 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_caching():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        
+
        # First crawl (should not use cache)
        start_time = asyncio.get_event_loop().time()
        result1 = await crawler.arun(url=url, bypass_cache=True)
        end_time = asyncio.get_event_loop().time()
        time_taken1 = end_time - start_time
-        
+
        assert result1.success
-        
+
        # Second crawl (should use cache)
        start_time = asyncio.get_event_loop().time()
        result2 = await crawler.arun(url=url, bypass_cache=False)
        end_time = asyncio.get_event_loop().time()
        time_taken2 = end_time - start_time
-        
+
        assert result2.success
        assert time_taken2 < time_taken1  # Cached result should be faster

+
@pytest.mark.asyncio
 async def test_bypass_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        
+
        # First crawl
        result1 = await crawler.arun(url=url, bypass_cache=False)
        assert result1.success
-        
+
        # Second crawl with bypass_cache=True
        result2 = await crawler.arun(url=url, bypass_cache=True)
        assert result2.success
-        
+
        # Content should be different (or at least, not guaranteed to be the same)
        assert result1.html != result2.html or result1.markdown != result2.markdown

+
@pytest.mark.asyncio
 async def test_clear_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        
+
        # Crawl and cache
        await crawler.arun(url=url, bypass_cache=False)
-        
+
        # Clear cache
        await crawler.aclear_cache()
-        
+
        # Check cache size
        cache_size = await crawler.aget_cache_size()
        assert cache_size == 0

+
@pytest.mark.asyncio
 async def test_flush_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        
+
        # Crawl and cache
        await crawler.arun(url=url, bypass_cache=False)
-        
+
        # Flush cache
        await crawler.aflush_cache()
-        
+
        # Check cache size
        cache_size = await crawler.aget_cache_size()
        assert cache_size == 0

+
 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_chunking_and_extraction_strategies.py
+++ b/tests/async/test_chunking_and_extraction_strategies.py
@@ -1,7 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
 import json

 # Add the parent directory to the Python path
@@ -9,8 +8,9 @@ parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler
-from crawl4ai.chunking_strategy import RegexChunking, NlpSentenceChunking
-from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy
+from crawl4ai.chunking_strategy import RegexChunking
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+

@pytest.mark.asyncio
 async def test_regex_chunking():
@@ -18,15 +18,14 @@ async def test_regex_chunking():
        url = "https://www.nbcnews.com/business"
        chunking_strategy = RegexChunking(patterns=["\n\n"])
        result = await crawler.arun(
-            url=url,
-            chunking_strategy=chunking_strategy,
-            bypass_cache=True
+            url=url, chunking_strategy=chunking_strategy, bypass_cache=True
        )
        assert result.success
        assert result.extracted_content
        chunks = json.loads(result.extracted_content)
        assert len(chunks) > 1  # Ensure multiple chunks were created

+
 # @pytest.mark.asyncio
 # async def test_cosine_strategy():
 #     async with AsyncWebCrawler(verbose=True) as crawler:
@@ -43,25 +42,25 @@ async def test_regex_chunking():
 #         assert len(extracted_data) > 0
 #         assert all('tags' in item for item in extracted_data)

+
@pytest.mark.asyncio
 async def test_llm_extraction_strategy():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        extraction_strategy = LLMExtractionStrategy(
            provider="openai/gpt-4o-mini",
-            api_token=os.getenv('OPENAI_API_KEY'),
-            instruction="Extract only content related to technology"
+            api_token=os.getenv("OPENAI_API_KEY"),
+            instruction="Extract only content related to technology",
        )
        result = await crawler.arun(
-            url=url,
-            extraction_strategy=extraction_strategy,
-            bypass_cache=True
+            url=url, extraction_strategy=extraction_strategy, bypass_cache=True
        )
        assert result.success
        assert result.extracted_content
        extracted_data = json.loads(result.extracted_content)
        assert len(extracted_data) > 0
-        assert all('content' in item for item in extracted_data)
+        assert all("content" in item for item in extracted_data)
+

 # @pytest.mark.asyncio
 # async def test_combined_chunking_and_extraction():
@@ -84,4 +83,4 @@ async def test_llm_extraction_strategy():

 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_content_extraction.py
+++ b/tests/async/test_content_extraction.py
@@ -1,8 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
-import json

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -10,6 +8,7 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_extract_markdown():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -20,6 +19,7 @@ async def test_extract_markdown():
        assert isinstance(result.markdown, str)
        assert len(result.markdown) > 0

+
@pytest.mark.asyncio
 async def test_extract_cleaned_html():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -30,6 +30,7 @@ async def test_extract_cleaned_html():
        assert isinstance(result.cleaned_html, str)
        assert len(result.cleaned_html) > 0

+
@pytest.mark.asyncio
 async def test_extract_media():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -46,6 +47,7 @@ async def test_extract_media():
            assert "alt" in image
            assert "type" in image

+
@pytest.mark.asyncio
 async def test_extract_links():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -63,6 +65,7 @@ async def test_extract_links():
            assert "href" in link
            assert "text" in link

+
@pytest.mark.asyncio
 async def test_extract_metadata():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -75,16 +78,20 @@ async def test_extract_metadata():
        assert "title" in metadata
        assert isinstance(metadata["title"], str)

+
@pytest.mark.asyncio
 async def test_css_selector_extraction():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        css_selector = "h1, h2, h3"
-        result = await crawler.arun(url=url, bypass_cache=True, css_selector=css_selector)
+        result = await crawler.arun(
+            url=url, bypass_cache=True, css_selector=css_selector
+        )
        assert result.success
        assert result.markdown
        assert all(heading in result.markdown for heading in ["#", "##", "###"])

+
 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_content_filter_bm25.py
+++ b/tests/async/test_content_filter_bm25.py
@@ -1,7 +1,6 @@
 import os, sys
 import pytest
 from bs4 import BeautifulSoup
-from typing import List

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -9,6 +8,7 @@ sys.path.append(parent_dir)

 from crawl4ai.content_filter_strategy import BM25ContentFilter

+
@pytest.fixture
 def basic_html():
    return """
@@ -28,6 +28,7 @@ def basic_html():
    </html>
    """

+
@pytest.fixture
 def wiki_html():
    return """
@@ -46,6 +47,7 @@ def wiki_html():
    </html>
    """

+
@pytest.fixture
 def no_meta_html():
    return """
@@ -57,26 +59,27 @@ def no_meta_html():
    </html>
    """

+
 class TestBM25ContentFilter:
    def test_basic_extraction(self, basic_html):
        """Test basic content extraction functionality"""
        filter = BM25ContentFilter()
        contents = filter.filter_content(basic_html)
-        
+
        assert contents, "Should extract content"
        assert len(contents) >= 1, "Should extract at least one content block"
-        assert "long paragraph" in ' '.join(contents).lower()
-        assert "navigation" not in ' '.join(contents).lower()
+        assert "long paragraph" in " ".join(contents).lower()
+        assert "navigation" not in " ".join(contents).lower()

    def test_user_query_override(self, basic_html):
        """Test that user query overrides metadata extraction"""
        user_query = "specific test query"
        filter = BM25ContentFilter(user_query=user_query)
-        
+
        # Access internal state to verify query usage
-        soup = BeautifulSoup(basic_html, 'lxml')
-        extracted_query = filter.extract_page_query(soup.find('head'))
-        
+        soup = BeautifulSoup(basic_html, "lxml")
+        extracted_query = filter.extract_page_query(soup.find("head"))
+
        assert extracted_query == user_query
        assert "Test description" not in extracted_query

@@ -84,8 +87,8 @@ class TestBM25ContentFilter:
        """Test that headers are properly extracted despite length"""
        filter = BM25ContentFilter()
        contents = filter.filter_content(wiki_html)
-        
-        combined_content = ' '.join(contents).lower()
+
+        combined_content = " ".join(contents).lower()
        assert "section 1" in combined_content, "Should include section header"
        assert "article title" in combined_content, "Should include main title"

@@ -93,9 +96,11 @@ class TestBM25ContentFilter:
        """Test fallback behavior when no metadata is present"""
        filter = BM25ContentFilter()
        contents = filter.filter_content(no_meta_html)
-        
+
        assert contents, "Should extract content even without metadata"
-        assert "First paragraph" in ' '.join(contents), "Should use first paragraph content"
+        assert "First paragraph" in " ".join(
+            contents
+        ), "Should use first paragraph content"

    def test_empty_input(self):
        """Test handling of empty input"""
@@ -108,29 +113,30 @@ class TestBM25ContentFilter:
        malformed_html = "<p>Unclosed paragraph<div>Nested content</p></div>"
        filter = BM25ContentFilter()
        contents = filter.filter_content(malformed_html)
-        
+
        assert isinstance(contents, list), "Should return list even with malformed HTML"
-        
+
    def test_threshold_behavior(self, basic_html):
        """Test different BM25 threshold values"""
        strict_filter = BM25ContentFilter(bm25_threshold=2.0)
        lenient_filter = BM25ContentFilter(bm25_threshold=0.5)
-        
+
        strict_contents = strict_filter.filter_content(basic_html)
        lenient_contents = lenient_filter.filter_content(basic_html)
-        
-        assert len(strict_contents) <= len(lenient_contents), \
-            "Strict threshold should extract fewer elements"
+
+        assert len(strict_contents) <= len(
+            lenient_contents
+        ), "Strict threshold should extract fewer elements"

    def test_html_cleaning(self, basic_html):
        """Test HTML cleaning functionality"""
        filter = BM25ContentFilter()
        contents = filter.filter_content(basic_html)
-        
-        cleaned_content = ' '.join(contents)
-        assert 'class=' not in cleaned_content, "Should remove class attributes"
-        assert 'style=' not in cleaned_content, "Should remove style attributes"
-        assert '<script' not in cleaned_content, "Should remove script tags"
+
+        cleaned_content = " ".join(contents)
+        assert "class=" not in cleaned_content, "Should remove class attributes"
+        assert "style=" not in cleaned_content, "Should remove style attributes"
+        assert "<script" not in cleaned_content, "Should remove script tags"

    def test_large_content(self):
        """Test handling of large content blocks"""
@@ -143,9 +149,9 @@ class TestBM25ContentFilter:
        contents = filter.filter_content(large_html)
        assert contents, "Should handle large content blocks"

-    @pytest.mark.parametrize("unwanted_tag", [
-        'script', 'style', 'nav', 'footer', 'header'
-    ])
+    @pytest.mark.parametrize(
+        "unwanted_tag", ["script", "style", "nav", "footer", "header"]
+    )
    def test_excluded_tags(self, unwanted_tag):
        """Test that specific tags are properly excluded"""
        html = f"""
@@ -156,20 +162,22 @@ class TestBM25ContentFilter:
        """
        filter = BM25ContentFilter()
        contents = filter.filter_content(html)
-        
-        combined_content = ' '.join(contents).lower()
+
+        combined_content = " ".join(contents).lower()
        assert "should not appear" not in combined_content
-        
+
    def test_performance(self, basic_html):
        """Test performance with timer"""
        filter = BM25ContentFilter()
-        
+
        import time
+
        start = time.perf_counter()
        filter.filter_content(basic_html)
        duration = time.perf_counter() - start
-        
+
        assert duration < 1.0, f"Processing took too long: {duration:.2f} seconds"

+
 if __name__ == "__main__":
-    pytest.main([__file__])
+    pytest.main([__file__])
--- a/tests/async/test_content_filter_prune.py
+++ b/tests/async/test_content_filter_prune.py
@@ -1,12 +1,12 @@
 import os, sys
 import pytest
-from bs4 import BeautifulSoup

 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

 from crawl4ai.content_filter_strategy import PruningContentFilter

+
@pytest.fixture
 def basic_html():
    return """
@@ -22,6 +22,7 @@ def basic_html():
    </html>
    """

+
@pytest.fixture
 def link_heavy_html():
    return """
@@ -40,6 +41,7 @@ def link_heavy_html():
    </html>
    """

+
@pytest.fixture
 def mixed_content_html():
    return """
@@ -60,13 +62,14 @@ def mixed_content_html():
    </html>
    """

+
 class TestPruningContentFilter:
    def test_basic_pruning(self, basic_html):
        """Test basic content pruning functionality"""
        filter = PruningContentFilter(min_word_threshold=5)
        contents = filter.filter_content(basic_html)
-        
-        combined_content = ' '.join(contents).lower()
+
+        combined_content = " ".join(contents).lower()
        assert "high-quality paragraph" in combined_content
        assert "sidebar content" not in combined_content
        assert "share buttons" not in combined_content
@@ -75,40 +78,42 @@ class TestPruningContentFilter:
        """Test minimum word threshold filtering"""
        filter = PruningContentFilter(min_word_threshold=10)
        contents = filter.filter_content(mixed_content_html)
-        
-        combined_content = ' '.join(contents).lower()
+
+        combined_content = " ".join(contents).lower()
        assert "short summary" not in combined_content
        assert "long high-quality paragraph" in combined_content
        assert "short comment" not in combined_content

    def test_threshold_types(self, basic_html):
        """Test fixed vs dynamic thresholds"""
-        fixed_filter = PruningContentFilter(threshold_type='fixed', threshold=0.48)
-        dynamic_filter = PruningContentFilter(threshold_type='dynamic', threshold=0.45)
-        
+        fixed_filter = PruningContentFilter(threshold_type="fixed", threshold=0.48)
+        dynamic_filter = PruningContentFilter(threshold_type="dynamic", threshold=0.45)
+
        fixed_contents = fixed_filter.filter_content(basic_html)
        dynamic_contents = dynamic_filter.filter_content(basic_html)
-        
-        assert len(fixed_contents) != len(dynamic_contents), \
-            "Fixed and dynamic thresholds should yield different results"
+
+        assert len(fixed_contents) != len(
+            dynamic_contents
+        ), "Fixed and dynamic thresholds should yield different results"

    def test_link_density_impact(self, link_heavy_html):
        """Test handling of link-heavy content"""
-        filter = PruningContentFilter(threshold_type='dynamic')
+        filter = PruningContentFilter(threshold_type="dynamic")
        contents = filter.filter_content(link_heavy_html)
-        
-        combined_content = ' '.join(contents).lower()
+
+        combined_content = " ".join(contents).lower()
        assert "good content paragraph" in combined_content
-        assert len([c for c in contents if 'href' in c]) < 2, \
-            "Should prune link-heavy sections"
+        assert (
+            len([c for c in contents if "href" in c]) < 2
+        ), "Should prune link-heavy sections"

    def test_tag_importance(self, mixed_content_html):
        """Test tag importance in scoring"""
-        filter = PruningContentFilter(threshold_type='dynamic')
+        filter = PruningContentFilter(threshold_type="dynamic")
        contents = filter.filter_content(mixed_content_html)
-        
-        has_article = any('article' in c.lower() for c in contents)
-        has_h1 = any('h1' in c.lower() for c in contents)
+
+        has_article = any("article" in c.lower() for c in contents)
+        has_h1 = any("h1" in c.lower() for c in contents)
        assert has_article or has_h1, "Should retain important tags"

    def test_empty_input(self):
@@ -127,26 +132,31 @@ class TestPruningContentFilter:
    def test_performance(self, basic_html):
        """Test performance with timer"""
        filter = PruningContentFilter()
-        
+
        import time
+
        start = time.perf_counter()
        filter.filter_content(basic_html)
        duration = time.perf_counter() - start
-        
+
        # Extra strict on performance since you mentioned milliseconds matter
        assert duration < 0.1, f"Processing took too long: {duration:.3f} seconds"

-    @pytest.mark.parametrize("threshold,expected_count", [
-        (0.3, 4),  # Very lenient
-        (0.48, 2), # Default
-        (0.7, 1),  # Very strict
-    ])
+    @pytest.mark.parametrize(
+        "threshold,expected_count",
+        [
+            (0.3, 4),  # Very lenient
+            (0.48, 2),  # Default
+            (0.7, 1),  # Very strict
+        ],
+    )
    def test_threshold_levels(self, mixed_content_html, threshold, expected_count):
        """Test different threshold levels"""
-        filter = PruningContentFilter(threshold_type='fixed', threshold=threshold)
+        filter = PruningContentFilter(threshold_type="fixed", threshold=threshold)
        contents = filter.filter_content(mixed_content_html)
-        assert len(contents) <= expected_count, \
-            f"Expected {expected_count} or fewer elements with threshold {threshold}"
+        assert (
+            len(contents) <= expected_count
+        ), f"Expected {expected_count} or fewer elements with threshold {threshold}"

    def test_consistent_output(self, basic_html):
        """Test output consistency across multiple runs"""
@@ -155,5 +165,6 @@ class TestPruningContentFilter:
        second_run = filter.filter_content(basic_html)
        assert first_run == second_run, "Output should be consistent"

+
 if __name__ == "__main__":
-    pytest.main([__file__])
+    pytest.main([__file__])
--- a/tests/async/test_content_scraper_strategy.py
+++ b/tests/async/test_content_scraper_strategy.py
@@ -1,22 +1,24 @@
-import asyncio
-from bs4 import BeautifulSoup
-from typing import Dict, Any
 import os
 import sys
 import time
 import csv
 from tabulate import tabulate
 from dataclasses import dataclass
-from typing import List, Dict
+from typing import List

-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 sys.path.append(parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

 from crawl4ai.content_scraping_strategy import WebScrapingStrategy
-from crawl4ai.content_scraping_strategy import WebScrapingStrategy as WebScrapingStrategyCurrent
+from crawl4ai.content_scraping_strategy import (
+    WebScrapingStrategy as WebScrapingStrategyCurrent,
+)
 # from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent

+
@dataclass
 class TestResult:
    name: str
@@ -27,69 +29,71 @@ class TestResult:
    markdown_length: int
    execution_time: float

+
 class StrategyTester:
    def __init__(self):
        self.new_scraper = WebScrapingStrategy()
        self.current_scraper = WebScrapingStrategyCurrent()
-        with open(__location__ + '/sample_wikipedia.html', 'r', encoding='utf-8') as f:
+        with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
            self.WIKI_HTML = f.read()
-        self.results = {'new': [], 'current': []}
-        
+        self.results = {"new": [], "current": []}
+
    def run_test(self, name: str, **kwargs) -> tuple[TestResult, TestResult]:
        results = []
        for scraper in [self.new_scraper, self.current_scraper]:
            start_time = time.time()
            result = scraper._get_content_of_website_optimized(
-                url="https://en.wikipedia.org/wiki/Test",
-                html=self.WIKI_HTML,
-                **kwargs
+                url="https://en.wikipedia.org/wiki/Test", html=self.WIKI_HTML, **kwargs
            )
            execution_time = time.time() - start_time
-            
+
            test_result = TestResult(
                name=name,
-                success=result['success'],
-                images=len(result['media']['images']),
-                internal_links=len(result['links']['internal']),
-                external_links=len(result['links']['external']),
-                markdown_length=len(result['markdown']),
-                execution_time=execution_time
+                success=result["success"],
+                images=len(result["media"]["images"]),
+                internal_links=len(result["links"]["internal"]),
+                external_links=len(result["links"]["external"]),
+                markdown_length=len(result["markdown"]),
+                execution_time=execution_time,
            )
            results.append(test_result)
-        
+
        return results[0], results[1]  # new, current

    def run_all_tests(self):
        test_cases = [
            ("Basic Extraction", {}),
-            ("Exclude Tags", {'excluded_tags': ['table', 'div.infobox', 'div.navbox']}),
-            ("Word Threshold", {'word_count_threshold': 50}),
-            ("CSS Selector", {'css_selector': 'div.mw-parser-output > p'}),
-            ("Link Exclusions", {
-                'exclude_external_links': True,
-                'exclude_social_media_links': True,
-                'exclude_domains': ['facebook.com', 'twitter.com']
-            }),
-            ("Media Handling", {
-                'exclude_external_images': True,
-                'image_description_min_word_threshold': 20
-            }),
-            ("Text Only", {
-                'only_text': True,
-                'remove_forms': True
-            }),
-            ("HTML Cleaning", {
-                'clean_html': True,
-                'keep_data_attributes': True
-            }),
-            ("HTML2Text Options", {
-                'html2text': {
-                    'skip_internal_links': True,
-                    'single_line_break': True,
-                    'mark_code': True,
-                    'preserve_tags': ['pre', 'code']
-                }
-            })
+            ("Exclude Tags", {"excluded_tags": ["table", "div.infobox", "div.navbox"]}),
+            ("Word Threshold", {"word_count_threshold": 50}),
+            ("CSS Selector", {"css_selector": "div.mw-parser-output > p"}),
+            (
+                "Link Exclusions",
+                {
+                    "exclude_external_links": True,
+                    "exclude_social_media_links": True,
+                    "exclude_domains": ["facebook.com", "twitter.com"],
+                },
+            ),
+            (
+                "Media Handling",
+                {
+                    "exclude_external_images": True,
+                    "image_description_min_word_threshold": 20,
+                },
+            ),
+            ("Text Only", {"only_text": True, "remove_forms": True}),
+            ("HTML Cleaning", {"clean_html": True, "keep_data_attributes": True}),
+            (
+                "HTML2Text Options",
+                {
+                    "html2text": {
+                        "skip_internal_links": True,
+                        "single_line_break": True,
+                        "mark_code": True,
+                        "preserve_tags": ["pre", "code"],
+                    }
+                },
+            ),
        ]

        all_results = []
@@ -99,64 +103,117 @@ class StrategyTester:
                all_results.append((name, new_result, current_result))
            except Exception as e:
                print(f"Error in {name}: {str(e)}")
-                
+
        self.save_results_to_csv(all_results)
        self.print_comparison_table(all_results)

    def save_results_to_csv(self, all_results: List[tuple]):
-        csv_file = os.path.join(__location__, 'strategy_comparison_results.csv')
-        with open(csv_file, 'w', newline='') as f:
+        csv_file = os.path.join(__location__, "strategy_comparison_results.csv")
+        with open(csv_file, "w", newline="") as f:
            writer = csv.writer(f)
-            writer.writerow(['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', 
-                           'External Links', 'Markdown Length', 'Execution Time'])
-            
+            writer.writerow(
+                [
+                    "Test Name",
+                    "Strategy",
+                    "Success",
+                    "Images",
+                    "Internal Links",
+                    "External Links",
+                    "Markdown Length",
+                    "Execution Time",
+                ]
+            )
+
            for name, new_result, current_result in all_results:
-                writer.writerow([name, 'New', new_result.success, new_result.images,
-                               new_result.internal_links, new_result.external_links,
-                               new_result.markdown_length, f"{new_result.execution_time:.3f}"])
-                writer.writerow([name, 'Current', current_result.success, current_result.images,
-                               current_result.internal_links, current_result.external_links,
-                               current_result.markdown_length, f"{current_result.execution_time:.3f}"])
+                writer.writerow(
+                    [
+                        name,
+                        "New",
+                        new_result.success,
+                        new_result.images,
+                        new_result.internal_links,
+                        new_result.external_links,
+                        new_result.markdown_length,
+                        f"{new_result.execution_time:.3f}",
+                    ]
+                )
+                writer.writerow(
+                    [
+                        name,
+                        "Current",
+                        current_result.success,
+                        current_result.images,
+                        current_result.internal_links,
+                        current_result.external_links,
+                        current_result.markdown_length,
+                        f"{current_result.execution_time:.3f}",
+                    ]
+                )

    def print_comparison_table(self, all_results: List[tuple]):
        table_data = []
-        headers = ['Test Name', 'Strategy', 'Success', 'Images', 'Internal Links', 
-                  'External Links', 'Markdown Length', 'Time (s)']
+        headers = [
+            "Test Name",
+            "Strategy",
+            "Success",
+            "Images",
+            "Internal Links",
+            "External Links",
+            "Markdown Length",
+            "Time (s)",
+        ]

        for name, new_result, current_result in all_results:
            # Check for differences
            differences = []
-            if new_result.images != current_result.images: differences.append('images')
-            if new_result.internal_links != current_result.internal_links: differences.append('internal_links')
-            if new_result.external_links != current_result.external_links: differences.append('external_links')
-            if new_result.markdown_length != current_result.markdown_length: differences.append('markdown')
-            
+            if new_result.images != current_result.images:
+                differences.append("images")
+            if new_result.internal_links != current_result.internal_links:
+                differences.append("internal_links")
+            if new_result.external_links != current_result.external_links:
+                differences.append("external_links")
+            if new_result.markdown_length != current_result.markdown_length:
+                differences.append("markdown")
+
            # Add row for new strategy
            new_row = [
-                name, 'New', new_result.success, new_result.images,
-                new_result.internal_links, new_result.external_links,
-                new_result.markdown_length, f"{new_result.execution_time:.3f}"
+                name,
+                "New",
+                new_result.success,
+                new_result.images,
+                new_result.internal_links,
+                new_result.external_links,
+                new_result.markdown_length,
+                f"{new_result.execution_time:.3f}",
            ]
            table_data.append(new_row)
-            
+
            # Add row for current strategy
            current_row = [
-                '', 'Current', current_result.success, current_result.images,
-                current_result.internal_links, current_result.external_links,
-                current_result.markdown_length, f"{current_result.execution_time:.3f}"
+                "",
+                "Current",
+                current_result.success,
+                current_result.images,
+                current_result.internal_links,
+                current_result.external_links,
+                current_result.markdown_length,
+                f"{current_result.execution_time:.3f}",
            ]
            table_data.append(current_row)
-            
+
            # Add difference summary if any
            if differences:
-                table_data.append(['', '⚠️ Differences', ', '.join(differences), '', '', '', '', ''])
-            
+                table_data.append(
+                    ["", "⚠️ Differences", ", ".join(differences), "", "", "", "", ""]
+                )
+
            # Add empty row for better readability
-            table_data.append([''] * len(headers))
+            table_data.append([""] * len(headers))

        print("\nStrategy Comparison Results:")
-        print(tabulate(table_data, headers=headers, tablefmt='grid'))
+        print(tabulate(table_data, headers=headers, tablefmt="grid"))
+

 if __name__ == "__main__":
    tester = StrategyTester()
-    tester.run_all_tests()
+    tester.run_all_tests()
--- a/tests/async/test_crawler_strategy.py
+++ b/tests/async/test_crawler_strategy.py
@@ -1,14 +1,13 @@
 import os
 import sys
 import pytest
-import asyncio

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler
-from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+

@pytest.mark.asyncio
 async def test_custom_user_agent():
@@ -20,6 +19,7 @@ async def test_custom_user_agent():
        assert result.success
        assert custom_user_agent in result.html

+
@pytest.mark.asyncio
 async def test_custom_headers():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -31,6 +31,7 @@ async def test_custom_headers():
        assert "X-Test-Header" in result.html
        assert "TestValue" in result.html

+
@pytest.mark.asyncio
 async def test_javascript_execution():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -40,19 +41,22 @@ async def test_javascript_execution():
        assert result.success
        assert "<h1>Modified by JS</h1>" in result.html

+
@pytest.mark.asyncio
 async def test_hook_execution():
    async with AsyncWebCrawler(verbose=True) as crawler:
+
        async def test_hook(page):
            await page.evaluate("document.body.style.backgroundColor = 'red';")
            return page

-        crawler.crawler_strategy.set_hook('after_goto', test_hook)
+        crawler.crawler_strategy.set_hook("after_goto", test_hook)
        url = "https://www.example.com"
        result = await crawler.arun(url=url, bypass_cache=True)
        assert result.success
        assert "background-color: red" in result.html

+
@pytest.mark.asyncio
 async def test_screenshot():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -63,6 +67,7 @@ async def test_screenshot():
        assert isinstance(result.screenshot, str)
        assert len(result.screenshot) > 0

+
 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_database_operations.py
+++ b/tests/async/test_database_operations.py
@@ -1,8 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
-import json

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -10,6 +8,7 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_cache_url():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -23,6 +22,7 @@ async def test_cache_url():
        assert result2.success
        assert result2.html == result1.html

+
@pytest.mark.asyncio
 async def test_bypass_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -34,25 +34,29 @@ async def test_bypass_cache():
        # Second run bypassing cache
        result2 = await crawler.arun(url=url, bypass_cache=True)
        assert result2.success
-        assert result2.html != result1.html  # Content might be different due to dynamic nature of websites
+        assert (
+            result2.html != result1.html
+        )  # Content might be different due to dynamic nature of websites
+

@pytest.mark.asyncio
 async def test_cache_size():
    async with AsyncWebCrawler(verbose=True) as crawler:
        initial_size = await crawler.aget_cache_size()
-        
+
        url = "https://www.nbcnews.com/business"
        await crawler.arun(url=url, bypass_cache=True)
-        
+
        new_size = await crawler.aget_cache_size()
        assert new_size == initial_size + 1

+
@pytest.mark.asyncio
 async def test_clear_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.example.org"
        await crawler.arun(url=url, bypass_cache=True)
-        
+
        initial_size = await crawler.aget_cache_size()
        assert initial_size > 0

@@ -60,12 +64,13 @@ async def test_clear_cache():
        new_size = await crawler.aget_cache_size()
        assert new_size == 0

+
@pytest.mark.asyncio
 async def test_flush_cache():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.example.net"
        await crawler.arun(url=url, bypass_cache=True)
-        
+
        initial_size = await crawler.aget_cache_size()
        assert initial_size > 0

@@ -75,8 +80,11 @@ async def test_flush_cache():

        # Try to retrieve the previously cached URL
        result = await crawler.arun(url=url, bypass_cache=False)
-        assert result.success  # The crawler should still succeed, but it will fetch the content anew
+        assert (
+            result.success
+        )  # The crawler should still succeed, but it will fetch the content anew
+

 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_dispatchers.py
+++ b/tests/async/test_dispatchers.py
@@ -0,0 +1,170 @@
+import pytest
+import time
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    MemoryAdaptiveDispatcher,
+    SemaphoreDispatcher,
+    RateLimiter,
+    CrawlerMonitor,
+    DisplayMode,
+    CacheMode,
+)
+
+
+@pytest.fixture
+def browser_config():
+    """Headless, non-verbose browser settings shared by the dispatcher tests."""
+    config = BrowserConfig(headless=True, verbose=False)
+    return config
+
+
+@pytest.fixture
+def run_config():
+    """Crawler run settings that bypass the cache and keep logging quiet."""
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
+    return config
+
+
+@pytest.fixture
+def test_urls():
+    """Three example.com URLs used as the default crawl batch."""
+    urls = (
+        "http://example.com",
+        "http://example.com/page1",
+        "http://example.com/page2",
+    )
+    return list(urls)
+
+
+@pytest.mark.asyncio
+class TestDispatchStrategies:
+    """Checks of ``AsyncWebCrawler.arun_many`` driven by different dispatchers.
+
+    Each test opens an ``AsyncWebCrawler`` and crawls the given URLs through
+    the dispatcher under test.
+    """
+
+    async def test_memory_adaptive_basic(self, browser_config, run_config, test_urls):
+        """MemoryAdaptiveDispatcher returns one successful result per URL."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=70.0, max_session_permit=2, check_interval=0.1
+            )
+            results = await crawler.arun_many(
+                test_urls, config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == len(test_urls)
+            assert all(r.success for r in results)
+
+    async def test_memory_adaptive_with_rate_limit(
+        self, browser_config, run_config, test_urls
+    ):
+        """MemoryAdaptiveDispatcher still succeeds for every URL with a RateLimiter attached."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=70.0,
+                max_session_permit=2,
+                check_interval=0.1,
+                rate_limiter=RateLimiter(
+                    base_delay=(0.1, 0.2), max_delay=1.0, max_retries=2
+                ),
+            )
+            results = await crawler.arun_many(
+                test_urls, config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == len(test_urls)
+            assert all(r.success for r in results)
+
+    async def test_semaphore_basic(self, browser_config, run_config, test_urls):
+        """SemaphoreDispatcher returns one successful result per URL."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = SemaphoreDispatcher(semaphore_count=2)
+            results = await crawler.arun_many(
+                test_urls, config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == len(test_urls)
+            assert all(r.success for r in results)
+
+    async def test_semaphore_with_rate_limit(
+        self, browser_config, run_config, test_urls
+    ):
+        """SemaphoreDispatcher still succeeds for every URL with a RateLimiter attached."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = SemaphoreDispatcher(
+                semaphore_count=2,
+                rate_limiter=RateLimiter(
+                    base_delay=(0.1, 0.2), max_delay=1.0, max_retries=2
+                ),
+            )
+            results = await crawler.arun_many(
+                test_urls, config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == len(test_urls)
+            assert all(r.success for r in results)
+
+    async def test_memory_adaptive_memory_error(
+        self, browser_config, run_config, test_urls
+    ):
+        """A 1% memory threshold with a 1s wait timeout is expected to raise MemoryError."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(
+                memory_threshold_percent=1.0,  # Set unrealistically low threshold
+                max_session_permit=2,
+                check_interval=0.1,
+                memory_wait_timeout=1.0,  # Short timeout for testing
+            )
+            with pytest.raises(MemoryError):
+                await crawler.arun_many(
+                    test_urls, config=run_config, dispatcher=dispatcher
+                )
+
+    async def test_empty_urls(self, browser_config, run_config):
+        """An empty URL list yields an empty result list."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
+            results = await crawler.arun_many(
+                [], config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == 0
+
+    async def test_single_url(self, browser_config, run_config):
+        """A single URL yields exactly one successful result."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
+            results = await crawler.arun_many(
+                ["http://example.com"], config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == 1
+            assert results[0].success
+
+    async def test_invalid_urls(self, browser_config, run_config):
+        """An unresolvable host yields one result that is marked unsuccessful."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
+            results = await crawler.arun_many(
+                ["http://invalid.url.that.doesnt.exist"],
+                config=run_config,
+                dispatcher=dispatcher,
+            )
+            assert len(results) == 1
+            assert not results[0].success
+
+    async def test_rate_limit_backoff(self, browser_config, run_config):
+        """Forcing HTTP 200 to count as rate-limited must slow the batch past one second."""
+        urls = ["http://example.com"] * 5  # Multiple requests to same domain
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            dispatcher = MemoryAdaptiveDispatcher(
+                max_session_permit=2,
+                rate_limiter=RateLimiter(
+                    base_delay=(0.1, 0.2),
+                    max_delay=1.0,
+                    max_retries=2,
+                    rate_limit_codes=[200],  # Force rate limiting for testing
+                ),
+            )
+            # perf_counter() is monotonic, so the measured duration cannot be
+            # skewed by wall-clock adjustments the way time.time() can.
+            start_time = time.perf_counter()
+            results = await crawler.arun_many(
+                urls, config=run_config, dispatcher=dispatcher
+            )
+            duration = time.perf_counter() - start_time
+            assert len(results) == len(urls)
+            assert duration > 1.0  # Ensure rate limiting caused delays
+
+    async def test_monitor_integration(self, browser_config, run_config, test_urls):
+        """A CrawlerMonitor attached to the dispatcher records a finished stat per URL."""
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            monitor = CrawlerMonitor(
+                max_visible_rows=5, display_mode=DisplayMode.DETAILED
+            )
+            dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2, monitor=monitor)
+            results = await crawler.arun_many(
+                test_urls, config=run_config, dispatcher=dispatcher
+            )
+            assert len(results) == len(test_urls)
+            # Check monitor stats
+            assert len(monitor.stats) == len(test_urls)
+            assert all(stat.end_time is not None for stat in monitor.stats.values())
+
+
+# Allow running this module directly; passes pytest-asyncio's auto mode on the command line.
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--asyncio-mode=auto"])
--- a/tests/async/test_edge_cases.py
+++ b/tests/async/test_edge_cases.py
@@ -2,9 +2,9 @@ import os
 import re
 import sys
 import pytest
-import json
 from bs4 import BeautifulSoup
 import asyncio
+
 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
@@ -59,19 +59,21 @@ from crawl4ai.async_webcrawler import AsyncWebCrawler
 #         assert result.success
 #         assert "github" in result.html.lower()

+
 # Add this test to your existing test file
@pytest.mark.asyncio
 async def test_typescript_commits_multi_page():
    first_commit = ""
+
    async def on_execution_started(page):
-        nonlocal first_commit 
+        nonlocal first_commit
        try:
            # Check if the page firct commit h4 text is different from the first commit (use document.querySelector('li.Box-sc-g0xbh4-0 h4'))
            while True:
-                await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')
-                commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')
-                commit = await commit.evaluate('(element) => element.textContent')
-                commit = re.sub(r'\s+', '', commit)
+                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await commit.evaluate("(element) => element.textContent")
+                commit = re.sub(r"\s+", "", commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
@@ -79,9 +81,8 @@ async def test_typescript_commits_multi_page():
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

-
    async with AsyncWebCrawler(verbose=True) as crawler:
-        crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)
+        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
@@ -97,19 +98,21 @@ async def test_typescript_commits_multi_page():
                url=url,  # Only use URL for the first page
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
-                js=js_next_page if page > 0 else None,  # Don't click 'next' on the first page
+                js=js_next_page
+                if page > 0
+                else None,  # Don't click 'next' on the first page
                bypass_cache=True,
-                js_only=page > 0  # Use js_only for subsequent pages
+                js_only=page > 0,  # Use js_only for subsequent pages
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            # Parse the HTML and extract commits
-            soup = BeautifulSoup(result.cleaned_html, 'html.parser')
+            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            # Take first commit find h4 extract text
            first_commit = commits[0].find("h4").text
-            first_commit = re.sub(r'\s+', '', first_commit)
+            first_commit = re.sub(r"\s+", "", first_commit)
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")
@@ -118,10 +121,13 @@ async def test_typescript_commits_multi_page():
        await crawler.crawler_strategy.kill_session(session_id)

        # Assertions
-        assert len(all_commits) >= 90, f"Expected at least 90 commits, but got {len(all_commits)}"
-        
-        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")                      
+        assert (
+            len(all_commits) >= 90
+        ), f"Expected at least 90 commits, but got {len(all_commits)}"
+
+        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+

 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_error_handling.py
+++ b/tests/async/test_error_handling.py
@@ -75,4 +75,4 @@

 # # Entry point for debugging
 # if __name__ == "__main__":
-#     pytest.main([__file__, "-v"])
+#     pytest.main([__file__, "-v"])
--- a/tests/async/test_evaluation_scraping_methods_performance.configs.py
+++ b/tests/async/test_evaluation_scraping_methods_performance.configs.py
@@ -0,0 +1,705 @@
+import json
+import time
+from bs4 import BeautifulSoup
+from crawl4ai.content_scraping_strategy import (
+    WebScrapingStrategy,
+    LXMLWebScrapingStrategy,
+)
+from typing import Dict, List, Tuple
+import difflib
+from lxml import html as lhtml, etree
+
+
+def normalize_dom(element):
+    """
+    Normalize an lxml HTML element in place so two trees can be compared fairly:
+      - Removes comment nodes, keeping the text that follows each comment
+      - Re-inserts the attributes of each node in sorted order
+
+    Comments are located with the absolute XPath ``//comment()``, i.e. the
+    whole document that ``element`` belongs to is searched.
+
+    Returns the same element (mutated).
+    """
+    # Remove comment nodes. lxml's remove() drops the removed node's tail text
+    # along with it, so first move the tail onto the preceding sibling (or the
+    # parent's text); otherwise "a <!-- c --> b" would silently lose " b".
+    for c in element.xpath("//comment()"):
+        p = c.getparent()
+        if p is None:
+            continue
+        if c.tail:
+            prev = c.getprevious()
+            if prev is not None:
+                prev.tail = (prev.tail or "") + c.tail
+            else:
+                p.text = (p.text or "") + c.tail
+        p.remove(c)
+
+    # NOTE: <head> is intentionally left in place; strip it here if the
+    # comparison should ignore it.
+
+    # Sort attributes (to avoid false positives due to attr order)
+    for el in element.iter():
+        if el.attrib:
+            # Convert to a sorted list of (k, v), then reassign
+            sorted_attribs = sorted(el.attrib.items())
+            el.attrib.clear()
+            for k, v in sorted_attribs:
+                el.set(k, v)
+
+    return element
+
+
+def strip_html_body(root):
+    """
+    Unwrap the <html>/<body> shell around a parsed document.
+
+    - <html> root with a <body> child: the children of the first <body> are
+      moved into a fresh <div>, which is returned.
+    - <body> root: its children are moved into a fresh <div>, which is returned.
+    - Anything else (including <html> without <body>): 'root' is returned as-is.
+
+    Only child elements are moved; the text directly inside <body> is not copied.
+    """
+    tag = (root.tag or "").lower()
+
+    if tag == "html":
+        found = root.xpath("./body")
+        if not found:
+            # No <body> found; just return the <html> root
+            return root
+        container = found[0]
+    elif tag == "body":
+        container = root
+    else:
+        return root
+
+    # Appending a node to the wrapper detaches it from its old parent
+    wrapper = lhtml.Element("div")
+    for node in container:
+        wrapper.append(node)
+    return wrapper
+
+
+def compare_nodes(node1, node2, differences, path="/"):
+    """
+    Recursively compare two element nodes, appending textual differences to `differences`.
+
+    Args:
+        node1, node2: Elements to compare (tag, attributes, text, children, tail).
+        differences: List that receives one message per mismatch (mutated in place).
+        path: Location of the nodes in the tree (like an XPath), used in messages.
+
+    Text and tail are compared after collapsing every run of whitespace to a
+    single space, so formatting-only differences are ignored. A tag mismatch or
+    a child-count mismatch stops the comparison of that subtree.
+    """
+    # 1) Compare tag names
+    if node1.tag != node2.tag:
+        differences.append(f"Tag mismatch at {path}: '{node1.tag}' vs. '{node2.tag}'")
+        return
+
+    # 2) Compare attributes
+    # By now, they are sorted in normalize_dom()
+    attrs1 = list(node1.attrib.items())
+    attrs2 = list(node2.attrib.items())
+    if attrs1 != attrs2:
+        differences.append(
+            f"Attribute mismatch at {path}/{node1.tag}: {attrs1} vs. {attrs2}"
+        )
+
+    # 3) Compare text; split()/join strips the ends and collapses inner whitespace
+    text1 = " ".join((node1.text or "").split())
+    text2 = " ".join((node2.text or "").split())
+    if text1 != text2:
+        differences.append(
+            f"Text mismatch at {path}/{node1.tag}: '{text1}' vs. '{text2}'"
+        )
+
+    # 4) Compare number of children
+    children1 = list(node1)
+    children2 = list(node2)
+    if len(children1) != len(children2):
+        differences.append(
+            f"Child count mismatch at {path}/{node1.tag}: {len(children1)} vs. {len(children2)}"
+        )
+        return  # If counts differ, no point comparing child by child
+
+    # 5) Recursively compare each child
+    for i, (c1, c2) in enumerate(zip(children1, children2)):
+        # Build a path for child
+        child_path = f"{path}/{node1.tag}[{i}]"
+        compare_nodes(c1, c2, differences, child_path)
+
+    # 6) Compare tail text, normalized exactly like the node text above so that
+    #    interior whitespace differences do not produce spurious mismatches
+    tail1 = " ".join((node1.tail or "").split())
+    tail2 = " ".join((node2.tail or "").split())
+    if tail1 != tail2:
+        differences.append(
+            f"Tail mismatch after {path}/{node1.tag}: '{tail1}' vs. '{tail2}'"
+        )
+
+
+def compare_html_structurally(html1, html2):
+    """
+    Structurally diff two HTML strings with lxml.
+
+    Both inputs are parsed, normalized with normalize_dom(), unwrapped with
+    strip_html_body(), and then walked in parallel by compare_nodes().
+
+    Returns a list of difference messages; an empty list means no differences
+    were found. If parsing an input raises etree.ParserError, a single-item
+    list naming that input is returned instead.
+    """
+    parsed = []
+    for markup, failure in (
+        (html1, "Error parsing HTML1"),
+        (html2, "Error parsing HTML2"),
+    ):
+        # Parse in order and stop at the first document lxml rejects
+        try:
+            parsed.append(lhtml.fromstring(markup))
+        except etree.ParserError:
+            return [failure]
+
+    # Normalize both DOMs first, then strip the <html>/<body> wrappers
+    normalized = [normalize_dom(tree) for tree in parsed]
+    left, right = [strip_html_body(tree) for tree in normalized]
+
+    found = []
+    compare_nodes(left, right, found, path="")
+    return found
+
+
+def generate_large_html(n_elements=1000):
+    """
+    Build a synthetic HTML document containing `n_elements` article blocks.
+
+    Each block holds a heading, a paragraph with a link, an image and a
+    two-item list, all numbered with the block index.
+    """
+    articles = (
+        f"""
+            <div class="article">
+                <h2>Heading {idx}</h2>
+                <p>This is paragraph {idx} with some content and a <a href="http://example.com/{idx}">link</a></p>
+                <img src="image{idx}.jpg" alt="Image {idx}">
+                <ul>
+                    <li>List item {idx}.1</li>
+                    <li>List item {idx}.2</li>
+                </ul>
+            </div>
+        """
+        for idx in range(n_elements)
+    )
+    return "".join(
+        ["<!DOCTYPE html><html><head></head><body>", *articles, "</body></html>"]
+    )
+
+
+def generate_complicated_html():
+    """
+    Return a fixed HTML document that exercises many scraper options at once.
+
+    The page contains links to several domains (example.com, social.com,
+    ads.example.com, facebook.com) plus a javascript: link and a relative
+    link, a form, data-* attributes, external, hidden and base64 images,
+    HTML comments, and <style>, <script> and <noscript> elements, so the
+    individual parameter toggles have something to act on.
+    """
+    return """
+    <!DOCTYPE html>
+    <html>
+      <head>
+        <title>Complicated Test Page</title>
+        <meta name="description" content="A very complicated page for testing.">
+        
+        <style>
+          .hidden { display: none; }
+          .highlight { color: red; }
+        </style>
+      </head>
+      <body>
+        <!-- This is a comment that we may remove if remove_comments=True -->
+        
+        <header>
+          <h1>Main Title of the Page</h1>
+          <nav>
+            <a href="http://example.com/home">Home</a>
+            <a href="http://social.com/profile">Social Profile</a>
+            <a href="javascript:void(0)">JS Void Link</a>
+          </nav>
+        </header>
+        
+        <noscript>
+          <p>JavaScript is disabled or not supported.</p>
+        </noscript>
+        
+        <form action="submit.php" method="post">
+          <input type="text" name="username" />
+          <button type="submit">Submit</button>
+        </form>
+        
+        <section>
+          <article>
+            <h2>Article Title</h2>
+            <p>
+              This paragraph has a good amount of text to exceed word_count_threshold if it's 
+              set to something small. But it might not exceed a very high threshold.
+            </p>
+            
+            <img src="http://images.example.com/photo.jpg" alt="Descriptive alt text"
+                 style="width:200px;height:150px;" data-lazy="true">
+            
+            <img src="icon.png" alt="Icon" style="display:none;">
+            
+            <p>Another short text. <a href="/local-link">Local Link</a></p>
+          </article>
+        </section>
+        
+        <section id="promo-section">
+          <p>Promo text <a href="http://ads.example.com/ad">Ad Link</a></p>
+        </section>
+        
+        <aside class="sidebar">
+          <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA..." alt="Base64 Image">
+          <div data-info="secret" class="social-widget">
+            <p>Follow us on <a href="http://facebook.com/brand">Facebook</a></p>
+          </div>
+        </aside>
+        
+        <!-- Another comment below this line -->
+        <script>console.log("script that might be removed");</script>
+        
+        <div style="display:none;">
+          <p>This is hidden</p>
+        </div>
+        
+        <footer>
+          <small>Footer Info &copy; 2025</small>
+        </footer>
+      </body>
+    </html>
+    """
+
+
+def get_test_scenarios():
+    """
+    Build the scenario table used to drive the scraper comparison.
+
+    Keys are scenario names; values are keyword-argument dictionaries that are
+    forwarded to scrap(). Every entry below is currently commented out, so the
+    function returns an empty dictionary until one of them is re-enabled.
+    """
+    scenarios = {
+        # "default": {},
+        # "exclude_domains": {
+        #     "exclude_domains": {"images.example.com", "ads.example.com"}
+        # },
+        # "exclude_social_media_links": {
+        #     "exclude_social_media_links": True
+        # },
+        # "high_word_threshold": {
+        #     "word_count_threshold": 100
+        # },
+        # "keep_data_attrs": {
+        #     "keep_data_attributes": True
+        # },
+        # "remove_forms_and_comments": {
+        #     "remove_forms": True,
+        #     "remove_comments": True
+        # },
+        # "exclude_tags_and_selector": {
+        #     "excluded_tags": ["aside", "script"],
+        #     "excluded_selector": ".social-widget"
+        # },
+        # "only_text_mode": {
+        #     "only_text": True
+        # },
+        # "combo_mode": {
+        #     "exclude_domains": {"images.example.com", "ads.example.com"},
+        #     "exclude_social_media_links": True,
+        #     "remove_forms": True,
+        #     "remove_comments": True,
+        #     "excluded_tags": ["aside"],
+        #     "excluded_selector": "#promo-section",
+        #     "only_text": False,
+        #     "keep_data_attributes": True,
+        #     "word_count_threshold": 20
+        # },
+        # "exclude_external_images": {
+        #     "exclude_external_images": True,
+        #     "exclude_social_media_links": True
+        # },
+        # "strict_image_scoring": {
+        #     "image_score_threshold": 3,
+        #     "image_description_min_word_threshold": 10
+        # },
+        # "custom_css_selector": {
+        #     "css_selector": "section#promo-section"
+        # },
+        # "remove_noscript": {
+        #     "excluded_tags": ["noscript"]
+        # },
+        # "exclude_external_links": {
+        #     "exclude_external_links": True
+        # },
+        # "large_word_count": {
+        #     "word_count_threshold": 500
+        # },
+        # "super_strict_images": {
+        #     "image_score_threshold": 5,
+        #     "image_description_min_word_threshold": 15
+        # },
+        # "exclude_style_and_script": {
+        #     "excluded_tags": ["style", "script"]
+        # },
+        # "keep_data_and_remove_forms": {
+        #     "keep_data_attributes": True,
+        #     "remove_forms": True
+        # },
+        # "only_text_high_word_count": {
+        #     "only_text": True,
+        #     "word_count_threshold": 40
+        # },
+        # "reduce_to_selector": {
+        #     "css_selector": "section > article"
+        # },
+        # "exclude_all_links": {
+        #     # Removes all external links and also excludes example.com & social.com
+        #     "exclude_domains": {"example.com", "social.com", "facebook.com"},
+        #     "exclude_external_links": True
+        # },
+        # "comprehensive_removal": {
+        #     # Exclude multiple tags, remove forms & comments,
+        #     # and also remove targeted selectors
+        #     "excluded_tags": ["aside", "noscript", "script"],
+        #     "excluded_selector": "#promo-section, .social-widget",
+        #     "remove_comments": True,
+        #     "remove_forms": True
+        # }
+    }
+    return scenarios
+
+
+class ScraperEquivalenceTester:
+    """Check that WebScrapingStrategy and LXMLWebScrapingStrategy agree.
+
+    Both strategies are run on the same HTML with the same parameters and
+    their ``links``, ``media`` and ``cleaned_html`` outputs are compared.
+    Any reported difference marks the case as failed.
+    """
+
+    # Maximum difference in character length (of the tag-name sequence or of
+    # the visible text) that compare_html_content() still treats as equal.
+    LENGTH_TOLERANCE = 100
+
+    def __init__(self):
+        # HTML fixtures for the built-in cases. They are generated eagerly
+        # although the loop in run_tests() that consumes them is currently
+        # commented out.
+        self.test_cases = {
+            "basic": self.generate_basic_html(),
+            "complex": self.generate_complex_html(),
+            "malformed": self.generate_malformed_html(),
+            # 'real_world': self.load_real_samples()
+        }
+
+    def generate_basic_html(self):
+        """Return the HTML produced by generate_large_html(1000)."""
+        return generate_large_html(1000)  # Your existing function
+
+    def generate_complex_html(self):
+        """Return a nested document with srcset/lazy images, an iframe and nav links."""
+        return """
+        <html><body>
+            <div class="nested-content">
+                <article>
+                    <h1>Main Title</h1>
+                    <img src="test.jpg" srcset="test-1x.jpg 1x, test-2x.jpg 2x" data-src="lazy.jpg">
+                    <p>Text with <a href="http://test.com">mixed <b>formatting</b></a></p>
+                    <iframe src="embedded.html"></iframe>
+                </article>
+                <nav>
+                    <ul>
+                        <li><a href="/page1">Link 1</a></li>
+                        <li><a href="javascript:void(0)">JS Link</a></li>
+                    </ul>
+                </nav>
+            </div>
+        </body></html>
+        """
+
+    def generate_malformed_html(self):
+        """Return deliberately broken markup (unclosed tags, unquoted attribute, CDATA)."""
+        return """
+        <div>Unclosed div
+        <p>Unclosed paragraph
+        <a href="test.com">Link</a>
+        <img src=no-quotes>
+        <script>document.write("<div>Dynamic</div>");</script>
+        <!-- Malformed comment -- > -->
+        <![CDATA[Test CDATA]]>
+        """
+
+    def load_real_samples(self):
+        """Read the sample pages under tests/samples/ and return them keyed by name.
+
+        The paths are relative to the current working directory. Each file is
+        read inside a ``with`` block so its handle is closed immediately
+        instead of being left for the garbage collector.
+
+        Returns:
+            Dict with the keys "article", "product" and "blog" mapping to the
+            file contents.
+        """
+        samples = {}
+        for name in ("article", "product", "blog"):
+            with open(f"tests/samples/{name}.html") as f:
+                samples[name] = f.read()
+        return samples
+
+    @staticmethod
+    def _index_first(items, key):
+        """Map item[key] -> item, keeping the first item seen for each key value."""
+        index = {}
+        for item in items:
+            index.setdefault(item[key], item)
+        return index
+
+    def deep_compare_links(self, old_links: Dict, new_links: Dict) -> List[str]:
+        """Describe how two link collections differ.
+
+        Args:
+            old_links: Mapping with "internal" and "external" lists of link
+                dicts; every link dict must have an "href".
+            new_links: Same structure, produced by the implementation under test.
+
+        Returns:
+            Human-readable difference messages. The list is empty when, for
+            each category, both sides contain the same hrefs and the "text"
+            and "title" values of matching links are equal.
+
+        Raises:
+            KeyError: If a category is missing or a link has no "href".
+        """
+        differences = []
+
+        for category in ["internal", "external"]:
+            # Index once per side instead of rescanning the list per URL.
+            old_by_url = self._index_first(old_links[category], "href")
+            new_by_url = self._index_first(new_links[category], "href")
+
+            missing = old_by_url.keys() - new_by_url.keys()
+            extra = new_by_url.keys() - old_by_url.keys()
+
+            if missing:
+                differences.append(f"Missing {category} links: {missing}")
+            if extra:
+                differences.append(f"Extra {category} links: {extra}")
+
+            # Compare link attributes for URLs present on both sides.
+            for url, old_link in old_by_url.items():
+                if url not in new_by_url:
+                    continue
+                new_link = new_by_url[url]
+
+                for attr in ["text", "title"]:
+                    # .get(): an absent attribute is compared as None rather
+                    # than aborting the whole comparison with a KeyError
+                    # (same convention as deep_compare_media).
+                    old_value = old_link.get(attr)
+                    new_value = new_link.get(attr)
+                    if old_value != new_value:
+                        differences.append(
+                            f"Link attribute mismatch for {url} - {attr}:"
+                            f" old='{old_value}' vs new='{new_value}'"
+                        )
+
+        return differences
+
+    def deep_compare_media(self, old_media: Dict, new_media: Dict) -> List[str]:
+        """Describe how two media collections differ.
+
+        Args:
+            old_media: Mapping with "images", "videos" and "audios" lists of
+                media dicts; every media dict must have a "src".
+            new_media: Same structure, produced by the implementation under test.
+
+        Returns:
+            Human-readable difference messages. The list is empty when, for
+            each media type, both sides contain the same sources and the "alt"
+            and "description" values of matching items are equal.
+
+        Raises:
+            KeyError: If a media type is missing or an item has no "src".
+        """
+        differences = []
+
+        for media_type in ["images", "videos", "audios"]:
+            # Index once per side instead of rescanning the list per source.
+            old_by_src = self._index_first(old_media[media_type], "src")
+            new_by_src = self._index_first(new_media[media_type], "src")
+
+            missing = old_by_src.keys() - new_by_src.keys()
+            extra = new_by_src.keys() - old_by_src.keys()
+
+            if missing:
+                differences.append(f"Missing {media_type}: {missing}")
+            if extra:
+                differences.append(f"Extra {media_type}: {extra}")
+
+            # Compare media attributes for sources present on both sides.
+            for src, old_item in old_by_src.items():
+                if src not in new_by_src:
+                    continue
+                new_item = new_by_src[src]
+
+                for attr in ["alt", "description"]:
+                    old_value = old_item.get(attr)
+                    new_value = new_item.get(attr)
+                    if old_value != new_value:
+                        differences.append(
+                            f"{media_type} attribute mismatch for {src} - {attr}:"
+                            f" old='{old_value}' vs new='{new_value}'"
+                        )
+
+        return differences
+
+    def compare_html_content(self, old_html: str, new_html: str) -> List[str]:
+        """Compare the tag structure and visible text of two HTML strings.
+
+        This is a tolerance check, not an equality check: a section is only
+        reported when the character lengths of the two normalized strings
+        differ by more than LENGTH_TOLERANCE. Content that differs but has a
+        similar length is therefore not reported.
+
+        Returns:
+            Zero, one or two messages, each containing a unified diff of the
+            whitespace-separated tokens.
+        """
+        # return compare_html_structurally(old_html, new_html)
+        differences = []
+
+        def normalize_html(html: str) -> Tuple[str, str]:
+            soup = BeautifulSoup(html, "lxml")
+            # Get both structure and text
+            structure = " ".join(tag.name for tag in soup.find_all())
+            text = " ".join(soup.get_text().split())
+            return structure, text
+
+        old_structure, old_text = normalize_html(old_html)
+        new_structure, new_text = normalize_html(new_html)
+
+        # Structure first, then text; both use the same length tolerance
+        # (a strict `old != new` comparison is the stricter alternative).
+        for heading, old_part, new_part in (
+            ("HTML structure differences:\n", old_structure, new_structure),
+            ("Text content differences:\n", old_text, new_text),
+        ):
+            if abs(len(old_part) - len(new_part)) > self.LENGTH_TOLERANCE:
+                diff = difflib.unified_diff(
+                    old_part.split(), new_part.split(), lineterm=""
+                )
+                differences.append(heading + "\n".join(diff))
+
+        return differences
+
+    def compare_results(
+        self, old_result: Dict, new_result: Dict
+    ) -> Dict[str, List[str]]:
+        """Compare two scraper results section by section.
+
+        Args:
+            old_result: Result dict with "links", "media" and "cleaned_html".
+            new_result: Result dict of the implementation under test.
+
+        Returns:
+            Dict with the keys "links", "media" and/or "html", each present
+            only when that section differs; empty when nothing differs.
+        """
+        differences = {}
+
+        # Compare links
+        link_differences = self.deep_compare_links(
+            old_result["links"], new_result["links"]
+        )
+        if link_differences:
+            differences["links"] = link_differences
+
+        # Compare media
+        media_differences = self.deep_compare_media(
+            old_result["media"], new_result["media"]
+        )
+        if media_differences:
+            differences["media"] = media_differences
+
+        # Compare HTML
+        html_differences = self.compare_html_content(
+            old_result["cleaned_html"], new_result["cleaned_html"]
+        )
+        if html_differences:
+            differences["html"] = html_differences
+
+        return differences
+
+    def run_tests(self) -> Dict:
+        """Run every scenario from get_test_scenarios() against both strategies.
+
+        The keyword arguments of each scenario are passed to ``scrap()`` of a
+        WebScrapingStrategy and of an LXMLWebScrapingStrategy, using the HTML
+        from generate_complicated_html(). The two results are compared with
+        compare_results() and the outcome is printed as it is produced.
+
+        Returns:
+            ``{"tests": [...], "summary": {"passed": int, "failed": int}}``;
+            each test entry holds the case name, the differences and both
+            execution times in seconds.
+        """
+        # We'll still keep some "test_cases" logic from above (basic, complex, malformed).
+        # But we add a new section for the complicated HTML scenarios.
+
+        results = {"tests": [], "summary": {"passed": 0, "failed": 0}}
+
+        # 1) First, run the existing 3 built-in test cases (basic, complex, malformed).
+        # for case_name, html in self.test_cases.items():
+        #     print(f"\nTesting built-in case: {case_name}...")
+
+        #     original = WebScrapingStrategy()
+        #     lxml = LXMLWebScrapingStrategy()
+
+        #     start = time.time()
+        #     orig_result = original.scrap("http://test.com", html)
+        #     orig_time = time.time() - start
+
+        #     print("\nOriginal Mode:")
+        #     print(f"Cleaned HTML size: {len(orig_result['cleaned_html'])/1024:.2f} KB")
+        #     print(f"Images: {len(orig_result['media']['images'])}")
+        #     print(f"External links: {len(orig_result['links']['external'])}")
+        #     print(f"Times - Original: {orig_time:.3f}s")
+
+        #     start = time.time()
+        #     lxml_result = lxml.scrap("http://test.com", html)
+        #     lxml_time = time.time() - start
+
+        #     print("\nLXML Mode:")
+        #     print(f"Cleaned HTML size: {len(lxml_result['cleaned_html'])/1024:.2f} KB")
+        #     print(f"Images: {len(lxml_result['media']['images'])}")
+        #     print(f"External links: {len(lxml_result['links']['external'])}")
+        #     print(f"Times - LXML: {lxml_time:.3f}s")
+
+        #     # Compare
+        #     diffs = {}
+        #     link_diff = self.deep_compare_links(orig_result['links'], lxml_result['links'])
+        #     if link_diff:
+        #         diffs['links'] = link_diff
+
+        #     media_diff = self.deep_compare_media(orig_result['media'], lxml_result['media'])
+        #     if media_diff:
+        #         diffs['media'] = media_diff
+
+        #     html_diff = self.compare_html_content(orig_result['cleaned_html'], lxml_result['cleaned_html'])
+        #     if html_diff:
+        #         diffs['html'] = html_diff
+
+        #     test_result = {
+        #         'case': case_name,
+        #         'lxml_mode': {
+        #             'differences': diffs,
+        #             'execution_time': lxml_time
+        #         },
+        #         'original_time': orig_time
+        #     }
+        #     results['tests'].append(test_result)
+
+        #     if not diffs:
+        #         results['summary']['passed'] += 1
+        #     else:
+        #         results['summary']['failed'] += 1
+
+        # 2) Now, run the complicated HTML with multiple parameter scenarios.
+        complicated_html = generate_complicated_html()
+        print("\n=== Testing complicated HTML with multiple parameter scenarios ===")
+
+        # Create the scrapers once (or you can re-create if needed)
+        original = WebScrapingStrategy()
+        lxml = LXMLWebScrapingStrategy()
+
+        for scenario_name, params in get_test_scenarios().items():
+            print(f"\nScenario: {scenario_name}")
+
+            # perf_counter() is monotonic, so the measured intervals cannot be
+            # distorted by system clock adjustments the way time.time() can.
+            start = time.perf_counter()
+            orig_result = original.scrap("http://test.com", complicated_html, **params)
+            orig_time = time.perf_counter() - start
+
+            start = time.perf_counter()
+            lxml_result = lxml.scrap("http://test.com", complicated_html, **params)
+            lxml_time = time.perf_counter() - start
+
+            # Links, media and cleaned HTML, in that order.
+            diffs = self.compare_results(orig_result, lxml_result)
+
+            test_result = {
+                "case": f"complicated_{scenario_name}",
+                "lxml_mode": {"differences": diffs, "execution_time": lxml_time},
+                "original_time": orig_time,
+            }
+            results["tests"].append(test_result)
+
+            if not diffs:
+                results["summary"]["passed"] += 1
+                print(
+                    f"✅ [OK] No differences found. Time(Orig: {orig_time:.3f}s, LXML: {lxml_time:.3f}s)"
+                )
+            else:
+                results["summary"]["failed"] += 1
+                print("❌ Differences found:")
+                for category, dlist in diffs.items():
+                    print(f"  {category}:")
+                    for d in dlist:
+                        print(f"    - {d}")
+
+        return results
+
+    def print_report(self, results: Dict):
+        """Print the totals and a per-case breakdown of a run_tests() result.
+
+        Args:
+            results: Dict in the shape returned by run_tests().
+        """
+        print("\n=== Scraper Equivalence Test Report ===\n")
+        print(f"Total Cases: {len(results['tests'])}")
+        print(f"Passed: {results['summary']['passed']}")
+        print(f"Failed: {results['summary']['failed']}")
+
+        for test in results["tests"]:
+            print(f"\nTest Case: {test['case']}")
+
+            differences = test["lxml_mode"]["differences"]
+            if not differences:
+                print("✅ All implementations produced identical results")
+                print(
+                    f"Times - Original: {test['original_time']:.3f}s, "
+                    f"LXML: {test['lxml_mode']['execution_time']:.3f}s"
+                )
+            else:
+                # differences is non-empty in this branch, so no second check.
+                print("❌ Differences found:")
+                print("\nLXML Mode Differences:")
+                for category, diffs in differences.items():
+                    print(f"\n{category}:")
+                    for diff in diffs:
+                        print(f"  - {diff}")
+
+
+def main():
+    """Run the equivalence suite, print its report and dump the results as JSON."""
+    equivalence_tester = ScraperEquivalenceTester()
+    outcome = equivalence_tester.run_tests()
+    equivalence_tester.print_report(outcome)
+
+    # Keep the full result structure on disk so failures can be inspected later.
+    results_path = "scraper_equivalence_results.json"
+    with open(results_path, "w") as results_file:
+        json.dump(outcome, results_file, indent=2)
+
+
+# Script entry point: run the comparison only when this file is executed directly.
+if __name__ == "__main__":
+    main()
--- a/tests/async/test_markdown_genertor.py
+++ b/tests/async/test_markdown_genertor.py
@@ -4,10 +4,10 @@
 # - **State:** open

 import os, sys, time
+
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
-__location__ = os.path.realpath(    os.path.join(os.getcwd(), os.path.dirname(__file__)))
-import asyncio
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 import os
 import time
 from typing import Dict, Any
@@ -16,18 +16,18 @@ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 # Get current directory
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

+
 def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
    """Helper function to print test results."""
    print(f"\n{'='*20} {name} {'='*20}")
    print(f"Execution time: {execution_time:.4f} seconds")
-    
-    
+
    # Save markdown to files
    for key, content in result.items():
        if isinstance(content, str):
            with open(__location__ + f"/output/{name.lower()}_{key}.md", "w") as f:
                f.write(content)
-    
+
    # # Print first few lines of each markdown version
    # for key, content in result.items():
    #     if isinstance(content, str):
@@ -36,32 +36,39 @@ def print_test_result(name: str, result: Dict[str, Any], execution_time: float):
    #         print(preview)
    #         print(f"Total length: {len(content)} characters")

+
 def test_basic_markdown_conversion():
    """Test basic markdown conversion with links."""
    with open(__location__ + "/data/wikipedia.html", "r") as f:
        cleaned_html = f.read()

    generator = DefaultMarkdownGenerator()
-    
+
    start_time = time.perf_counter()
    result = generator.generate_markdown(
-        cleaned_html=cleaned_html,
-        base_url="https://en.wikipedia.org"
+        cleaned_html=cleaned_html, base_url="https://en.wikipedia.org"
    )
    execution_time = time.perf_counter() - start_time
-    
-    print_test_result("Basic Markdown Conversion", {
-        'raw': result.raw_markdown,
-        'with_citations': result.markdown_with_citations,
-        'references': result.references_markdown
-    }, execution_time)
-    
+
+    print_test_result(
+        "Basic Markdown Conversion",
+        {
+            "raw": result.raw_markdown,
+            "with_citations": result.markdown_with_citations,
+            "references": result.references_markdown,
+        },
+        execution_time,
+    )
+
    # Basic assertions
    assert result.raw_markdown, "Raw markdown should not be empty"
    assert result.markdown_with_citations, "Markdown with citations should not be empty"
    assert result.references_markdown, "References should not be empty"
    assert "⟨" in result.markdown_with_citations, "Citations should use ⟨⟩ brackets"
-    assert "## References" in result.references_markdown, "Should contain references section"
+    assert (
+        "## References" in result.references_markdown
+    ), "Should contain references section"
+

 def test_relative_links():
    """Test handling of relative links with base URL."""
@@ -69,97 +76,106 @@ def test_relative_links():
    Here's a [relative link](/wiki/Apple) and an [absolute link](https://example.com).
    Also an [image](/images/test.png) and another [page](/wiki/Banana).
    """
-    
+
    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
-        cleaned_html=markdown,
-        base_url="https://en.wikipedia.org"
+        cleaned_html=markdown, base_url="https://en.wikipedia.org"
    )
-    
+
    assert "https://en.wikipedia.org/wiki/Apple" in result.references_markdown
    assert "https://example.com" in result.references_markdown
    assert "https://en.wikipedia.org/images/test.png" in result.references_markdown

+
 def test_duplicate_links():
    """Test handling of duplicate links."""
    markdown = """
    Here's a [link](/test) and another [link](/test) and a [different link](/other).
    """
-    
+
    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
-        cleaned_html=markdown,
-        base_url="https://example.com"
+        cleaned_html=markdown, base_url="https://example.com"
    )
-    
+
    # Count citations in markdown
    citations = result.markdown_with_citations.count("⟨1⟩")
    assert citations == 2, "Same link should use same citation number"

+
 def test_link_descriptions():
    """Test handling of link titles and descriptions."""
    markdown = """
    Here's a [link with title](/test "Test Title") and a [link with description](/other) to test.
    """
-    
+
    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
-        cleaned_html=markdown,
-        base_url="https://example.com"
+        cleaned_html=markdown, base_url="https://example.com"
    )
-    
-    assert "Test Title" in result.references_markdown, "Link title should be in references"
-    assert "link with description" in result.references_markdown, "Link text should be in references"
+
+    assert (
+        "Test Title" in result.references_markdown
+    ), "Link title should be in references"
+    assert (
+        "link with description" in result.references_markdown
+    ), "Link text should be in references"
+

 def test_performance_large_document():
    """Test performance with large document."""
    with open(__location__ + "/data/wikipedia.md", "r") as f:
        markdown = f.read()
-    
+
    # Test with multiple iterations
    iterations = 5
    times = []
-    
+
    generator = DefaultMarkdownGenerator()
-    
+
    for i in range(iterations):
        start_time = time.perf_counter()
        result = generator.generate_markdown(
-            cleaned_html=markdown,
-            base_url="https://en.wikipedia.org"
+            cleaned_html=markdown, base_url="https://en.wikipedia.org"
        )
        end_time = time.perf_counter()
        times.append(end_time - start_time)
-    
+
    avg_time = sum(times) / len(times)
    print(f"\n{'='*20} Performance Test {'='*20}")
-    print(f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds")
+    print(
+        f"Average execution time over {iterations} iterations: {avg_time:.4f} seconds"
+    )
    print(f"Min time: {min(times):.4f} seconds")
    print(f"Max time: {max(times):.4f} seconds")

+
 def test_image_links():
    """Test handling of image links."""
    markdown = """
    Here's an ![image](/image.png "Image Title") and another ![image](/other.jpg).
    And a regular [link](/page).
    """
-    
+
    generator = DefaultMarkdownGenerator()
    result = generator.generate_markdown(
-        cleaned_html=markdown,
-        base_url="https://example.com"
+        cleaned_html=markdown, base_url="https://example.com"
    )
-    
-    assert "![" in result.markdown_with_citations, "Image markdown syntax should be preserved"
-    assert "Image Title" in result.references_markdown, "Image title should be in references"
+
+    assert (
+        "![" in result.markdown_with_citations
+    ), "Image markdown syntax should be preserved"
+    assert (
+        "Image Title" in result.references_markdown
+    ), "Image title should be in references"
+

 if __name__ == "__main__":
    print("Running markdown generation strategy tests...")
-    
+
    test_basic_markdown_conversion()
    test_relative_links()
    test_duplicate_links()
    test_link_descriptions()
    test_performance_large_document()
    test_image_links()
-    
--- a/tests/async/test_parameters_and_options.py
+++ b/tests/async/test_parameters_and_options.py
@@ -1,8 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
-import json

 # Add the parent directory to the Python path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -10,24 +8,37 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_word_count_threshold():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        result_no_threshold = await crawler.arun(url=url, word_count_threshold=0, bypass_cache=True)
-        result_with_threshold = await crawler.arun(url=url, word_count_threshold=50, bypass_cache=True)
-        
+        result_no_threshold = await crawler.arun(
+            url=url, word_count_threshold=0, bypass_cache=True
+        )
+        result_with_threshold = await crawler.arun(
+            url=url, word_count_threshold=50, bypass_cache=True
+        )
+
        assert len(result_no_threshold.markdown) > len(result_with_threshold.markdown)

+
@pytest.mark.asyncio
 async def test_css_selector():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        css_selector = "h1, h2, h3"
-        result = await crawler.arun(url=url, css_selector=css_selector, bypass_cache=True)
-        
+        result = await crawler.arun(
+            url=url, css_selector=css_selector, bypass_cache=True
+        )
+
        assert result.success
-        assert "<h1" in result.cleaned_html or "<h2" in result.cleaned_html or "<h3" in result.cleaned_html
+        assert (
+            "<h1" in result.cleaned_html
+            or "<h2" in result.cleaned_html
+            or "<h3" in result.cleaned_html
+        )
+

@pytest.mark.asyncio
 async def test_javascript_execution():
@@ -36,59 +47,70 @@ async def test_javascript_execution():

        # Crawl without JS
        result_without_more = await crawler.arun(url=url, bypass_cache=True)
-        
-        js_code = ["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"]
+
+        js_code = [
+            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+        ]
        result_with_more = await crawler.arun(url=url, js=js_code, bypass_cache=True)
-        
+
        assert result_with_more.success
        assert len(result_with_more.markdown) > len(result_without_more.markdown)

+
@pytest.mark.asyncio
 async def test_screenshot():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        result = await crawler.arun(url=url, screenshot=True, bypass_cache=True)
-        
+
        assert result.success
        assert result.screenshot
        assert isinstance(result.screenshot, str)  # Should be a base64 encoded string

+
@pytest.mark.asyncio
 async def test_custom_user_agent():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        custom_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Crawl4AI/1.0"
-        result = await crawler.arun(url=url, user_agent=custom_user_agent, bypass_cache=True)
-        
+        result = await crawler.arun(
+            url=url, user_agent=custom_user_agent, bypass_cache=True
+        )
+
        assert result.success
        # Note: We can't directly verify the user agent in the result, but we can check if the crawl was successful

+
@pytest.mark.asyncio
 async def test_extract_media_and_links():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        result = await crawler.arun(url=url, bypass_cache=True)
-        
+
        assert result.success
        assert result.media
        assert isinstance(result.media, dict)
-        assert 'images' in result.media
+        assert "images" in result.media
        assert result.links
        assert isinstance(result.links, dict)
-        assert 'internal' in result.links and 'external' in result.links
+        assert "internal" in result.links and "external" in result.links
+

@pytest.mark.asyncio
 async def test_metadata_extraction():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
        result = await crawler.arun(url=url, bypass_cache=True)
-        
+
        assert result.success
        assert result.metadata
        assert isinstance(result.metadata, dict)
        # Check for common metadata fields
-        assert any(key in result.metadata for key in ['title', 'description', 'keywords'])
+        assert any(
+            key in result.metadata for key in ["title", "description", "keywords"]
+        )
+

 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_performance.py
+++ b/tests/async/test_performance.py
@@ -1,7 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
 import time

 # Add the parent directory to the Python path
@@ -10,6 +9,7 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_crawl_speed():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -17,13 +17,14 @@ async def test_crawl_speed():
        start_time = time.time()
        result = await crawler.arun(url=url, bypass_cache=True)
        end_time = time.time()
-        
+
        assert result.success
        crawl_time = end_time - start_time
        print(f"Crawl time: {crawl_time:.2f} seconds")
-        
+
        assert crawl_time < 10, f"Crawl took too long: {crawl_time:.2f} seconds"

+
@pytest.mark.asyncio
 async def test_concurrent_crawling_performance():
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -32,41 +33,47 @@ async def test_concurrent_crawling_performance():
            "https://www.example.com",
            "https://www.python.org",
            "https://www.github.com",
-            "https://www.stackoverflow.com"
+            "https://www.stackoverflow.com",
        ]
-        
+
        start_time = time.time()
        results = await crawler.arun_many(urls=urls, bypass_cache=True)
        end_time = time.time()
-        
+
        total_time = end_time - start_time
        print(f"Total time for concurrent crawling: {total_time:.2f} seconds")
-        
+
        assert all(result.success for result in results)
        assert len(results) == len(urls)
-        
-        assert total_time < len(urls) * 5, f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
+
+        assert (
+            total_time < len(urls) * 5
+        ), f"Concurrent crawling not significantly faster: {total_time:.2f} seconds"
+

@pytest.mark.asyncio
 async def test_crawl_speed_with_caching():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nbcnews.com/business"
-        
+
        start_time = time.time()
        result1 = await crawler.arun(url=url, bypass_cache=True)
        end_time = time.time()
        first_crawl_time = end_time - start_time
-        
+
        start_time = time.time()
        result2 = await crawler.arun(url=url, bypass_cache=False)
        end_time = time.time()
        second_crawl_time = end_time - start_time
-        
+
        assert result1.success and result2.success
        print(f"First crawl time: {first_crawl_time:.2f} seconds")
        print(f"Second crawl time (cached): {second_crawl_time:.2f} seconds")
-        
-        assert second_crawl_time < first_crawl_time / 2, "Cached crawl not significantly faster"
+
+        assert (
+            second_crawl_time < first_crawl_time / 2
+        ), "Cached crawl not significantly faster"
+

 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/async/test_screenshot.py
+++ b/tests/async/test_screenshot.py
@@ -1,7 +1,6 @@
 import os
 import sys
 import pytest
-import asyncio
 import base64
 from PIL import Image
 import io
@@ -12,113 +11,112 @@ sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
@pytest.mark.asyncio
 async def test_basic_screenshot():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://example.com"  # A static website
        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
-        
+
        assert result.success
        assert result.screenshot is not None
-        
+
        # Verify the screenshot is a valid image
        image_data = base64.b64decode(result.screenshot)
        image = Image.open(io.BytesIO(image_data))
        assert image.format == "PNG"

+
@pytest.mark.asyncio
 async def test_screenshot_with_wait_for():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Using a website with dynamic content
        url = "https://www.youtube.com"
        wait_for = "css:#content"  # Wait for the main content to load
-        
+
        result = await crawler.arun(
-            url=url, 
-            bypass_cache=True, 
-            screenshot=True, 
-            wait_for=wait_for
+            url=url, bypass_cache=True, screenshot=True, wait_for=wait_for
        )
-        
+
        assert result.success
        assert result.screenshot is not None
-        
+
        # Verify the screenshot is a valid image
        image_data = base64.b64decode(result.screenshot)
        image = Image.open(io.BytesIO(image_data))
        assert image.format == "PNG"
-        
+
        # You might want to add more specific checks here, like image dimensions
        # or even use image recognition to verify certain elements are present

+
@pytest.mark.asyncio
 async def test_screenshot_with_js_wait_for():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.amazon.com"
        wait_for = "js:() => document.querySelector('#nav-logo-sprites') !== null"
-        
+
        result = await crawler.arun(
-            url=url, 
-            bypass_cache=True, 
-            screenshot=True, 
-            wait_for=wait_for
+            url=url, bypass_cache=True, screenshot=True, wait_for=wait_for
        )
-        
+
        assert result.success
        assert result.screenshot is not None
-        
+
        image_data = base64.b64decode(result.screenshot)
        image = Image.open(io.BytesIO(image_data))
        assert image.format == "PNG"

+
@pytest.mark.asyncio
 async def test_screenshot_without_wait_for():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.nytimes.com"  # A website with lots of dynamic content
-        
+
        result = await crawler.arun(url=url, bypass_cache=True, screenshot=True)
-        
+
        assert result.success
        assert result.screenshot is not None
-        
+
        image_data = base64.b64decode(result.screenshot)
        image = Image.open(io.BytesIO(image_data))
        assert image.format == "PNG"

+
@pytest.mark.asyncio
 async def test_screenshot_comparison():
    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://www.reddit.com"
        wait_for = "css:#SHORTCUT_FOCUSABLE_DIV"
-        
+
        # Take screenshot without wait_for
        result_without_wait = await crawler.arun(
-            url=url, 
-            bypass_cache=True, 
-            screenshot=True
+            url=url, bypass_cache=True, screenshot=True
        )
-        
+
        # Take screenshot with wait_for
        result_with_wait = await crawler.arun(
-            url=url, 
-            bypass_cache=True, 
-            screenshot=True, 
-            wait_for=wait_for
+            url=url, bypass_cache=True, screenshot=True, wait_for=wait_for
        )
-        
+
        assert result_without_wait.success and result_with_wait.success
        assert result_without_wait.screenshot is not None
        assert result_with_wait.screenshot is not None
-        
+
        # Compare the two screenshots
-        image_without_wait = Image.open(io.BytesIO(base64.b64decode(result_without_wait.screenshot)))
-        image_with_wait = Image.open(io.BytesIO(base64.b64decode(result_with_wait.screenshot)))
-        
+        image_without_wait = Image.open(
+            io.BytesIO(base64.b64decode(result_without_wait.screenshot))
+        )
+        image_with_wait = Image.open(
+            io.BytesIO(base64.b64decode(result_with_wait.screenshot))
+        )
+
        # This is a simple size comparison. In a real-world scenario, you might want to use
        # more sophisticated image comparison techniques.
        assert image_with_wait.size[0] >= image_without_wait.size[0]
        assert image_with_wait.size[1] >= image_without_wait.size[1]

+
 # Entry point for debugging
 if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
+    pytest.main([__file__, "-v"])
--- a/tests/docker_example.py
+++ b/tests/docker_example.py
@@ -6,53 +6,72 @@ import base64
 import os
 from typing import Dict, Any

+
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN')  # Check environment variable as fallback
-        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
-        
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+        self.api_token = api_token or os.getenv(
+            "CRAWL4AI_API_TOKEN"
+        )  # Check environment variable as fallback
+        self.headers = (
+            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+        )
+
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
        # Submit crawl job
-        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+        response = requests.post(
+            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+        )
        if response.status_code == 403:
            raise Exception("API token is invalid or missing")
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")
-        
+
        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-                
-            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+
+            result = requests.get(
+                f"{self.base_url}/task/{task_id}", headers=self.headers
+            )
            status = result.json()
-            
+
            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")
-                
+
            if status["status"] == "completed":
                return status
-                
+
            time.sleep(2)
-            
+
    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+        response = requests.post(
+            f"{self.base_url}/crawl_sync",
+            json=request_data,
+            headers=self.headers,
+            timeout=60,
+        )
        if response.status_code == 408:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()

+
 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
        # base_url="http://localhost:11235" ,
        base_url="https://crawl4ai-sby74.ondigitalocean.app",
-        api_token="test"
+        api_token="test",
    )
    print(f"Testing Crawl4AI Docker {version} version")
-    
+
    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
@@ -60,18 +79,18 @@ def test_docker_deployment(version="basic"):
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)
-    
+
    # Test cases based on version
    test_basic_crawl(tester)
    test_basic_crawl(tester)
    test_basic_crawl_sync(tester)
-    
+
    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)

@@ -81,35 +100,37 @@ def test_docker_deployment(version="basic"):
    # test_llm_extraction(tester)
    # test_llm_with_ollama(tester)
    # test_screenshot(tester)
-    
+

 def test_basic_crawl(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
-        "priority": 10, 
-        "session_id": "test"
+        "priority": 10,
+        "session_id": "test",
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0

+
 def test_basic_crawl_sync(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl (Sync) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
-        "session_id": "test"
+        "session_id": "test",
    }
-    
+
    result = tester.submit_sync(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['status'] == 'completed'
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
-    
+    assert result["status"] == "completed"
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
+
+
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
@@ -119,32 +140,29 @@ def test_js_execution(tester: Crawl4AiTester):
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_css_selector(tester: Crawl4AiTester):
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
-        "crawler_params": {
-            "headless": True
-        },
-        "extra": {"word_count_threshold": 10}
-        
+        "crawler_params": {"headless": True},
+        "extra": {"word_count_threshold": 10},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_structured_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Structured Extraction ===")
    schema = {
@@ -165,21 +183,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
-            }
+            },
        ],
    }
-    
+
    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
-        "extraction_config": {
-            "type": "json_css",
-            "params": {
-                "schema": schema
-            }
-        }
+        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
    }
-    
+
    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
@@ -187,6 +200,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
    assert result["result"]["success"]
    assert len(extracted) > 0

+
 def test_llm_extraction(tester: Crawl4AiTester):
    print("\n=== Testing LLM Extraction ===")
    schema = {
@@ -194,20 +208,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
        "properties": {
            "model_name": {
                "type": "string",
-                "description": "Name of the OpenAI model."
+                "description": "Name of the OpenAI model.",
            },
            "input_fee": {
                "type": "string",
-                "description": "Fee for input token for the OpenAI model."
+                "description": "Fee for input token for the OpenAI model.",
            },
            "output_fee": {
                "type": "string",
-                "description": "Fee for output token for the OpenAI model."
-            }
+                "description": "Fee for output token for the OpenAI model.",
+            },
        },
-        "required": ["model_name", "input_fee", "output_fee"]
+        "required": ["model_name", "input_fee", "output_fee"],
    }
-    
+
    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
@@ -218,12 +232,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
-            }
+                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+            },
        },
-        "crawler_params": {"word_count_threshold": 1}
+        "crawler_params": {"word_count_threshold": 1},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -233,6 +247,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")

+
 def test_llm_with_ollama(tester: Crawl4AiTester):
    print("\n=== Testing LLM with Ollama ===")
    schema = {
@@ -240,20 +255,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
        "properties": {
            "article_title": {
                "type": "string",
-                "description": "The main title of the news article"
+                "description": "The main title of the news article",
            },
            "summary": {
                "type": "string",
-                "description": "A brief summary of the article content"
+                "description": "A brief summary of the article content",
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
-                "description": "Main topics or themes discussed in the article"
-            }
-        }
+                "description": "Main topics or themes discussed in the article",
+            },
+        },
    }
-    
+
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
@@ -263,13 +278,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": "Extract the main article information including title, summary, and main topics."
-            }
+                "instruction": "Extract the main article information including title, summary, and main topics.",
+            },
        },
        "extra": {"word_count_threshold": 1},
-        "crawler_params": {"verbose": True}
+        "crawler_params": {"verbose": True},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -278,6 +293,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")

+
 def test_cosine_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Cosine Extraction ===")
    request = {
@@ -289,11 +305,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
-                "top_k": 3
-            }
-        }
+                "top_k": 3,
+            },
+        },
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -303,30 +319,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")

+
 def test_screenshot(tester: Crawl4AiTester):
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))
-    
+
    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")
-    
+
    assert result["result"]["success"]

+
 if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
-    test_docker_deployment(version)
+    test_docker_deployment(version)
--- a/tests/test_cli_docs.py
+++ b/tests/test_cli_docs.py
@@ -1,13 +1,13 @@
 import asyncio
-from pathlib import Path
 from crawl4ai.docs_manager import DocsManager
 from click.testing import CliRunner
 from crawl4ai.cli import cli

+
 def test_cli():
    """Test all CLI commands"""
    runner = CliRunner()
-    
+
    print("\n1. Testing docs update...")
    # Use sync version for testing
    docs_manager = DocsManager()
@@ -27,17 +27,18 @@ def test_cli():
    # print("\n3. Testing search...")
    # result = runner.invoke(cli, ['docs', 'search', 'how to use crawler', '--build-index'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
-    # print(f"First 200 chars: {result.output[:200]}...")    
-    
+    # print(f"First 200 chars: {result.output[:200]}...")
+
    # print("\n4. Testing combine with sections...")
    # result = runner.invoke(cli, ['docs', 'combine', 'chunking_strategies', 'extraction_strategies', '--mode', 'extended'])
    # print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    # print(f"First 200 chars: {result.output[:200]}...")

    print("\n5. Testing combine all sections...")
-    result = runner.invoke(cli, ['docs', 'combine', '--mode', 'condensed'])
+    result = runner.invoke(cli, ["docs", "combine", "--mode", "condensed"])
    print(f"Status: {'✅' if result.exit_code == 0 else '❌'}")
    print(f"First 200 chars: {result.output[:200]}...")

+
 if __name__ == "__main__":
-    test_cli()
+    test_cli()
--- a/tests/test_docker.py
+++ b/tests/test_docker.py
@@ -6,38 +6,44 @@ import base64
 import os
 from typing import Dict, Any

+
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235"):
        self.base_url = base_url
-        
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
        # Submit crawl job
        response = requests.post(f"{self.base_url}/crawl", json=request_data)
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")
-        
+
        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-                
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+
            result = requests.get(f"{self.base_url}/task/{task_id}")
            status = result.json()
-            
+
            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")
-                
+
            if status["status"] == "completed":
                return status
-                
+
            time.sleep(2)

+
 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester()
    print(f"Testing Crawl4AI Docker {version} version")
-    
+
    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
@@ -45,16 +51,16 @@ def test_docker_deployment(version="basic"):
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)
-    
+
    # Test cases based on version
    test_basic_crawl(tester)
-    
+
    # if version in ["full", "transformer"]:
    #     test_cosine_extraction(tester)

@@ -64,20 +70,18 @@ def test_docker_deployment(version="basic"):
    # test_llm_extraction(tester)
    # test_llm_with_ollama(tester)
    # test_screenshot(tester)
-    
+

 def test_basic_crawl(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl ===")
-    request = {
-        "urls": "https://www.nbcnews.com/business",
-        "priority": 10
-    }
-    
+    request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
+
    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0

+
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
@@ -87,32 +91,29 @@ def test_js_execution(tester: Crawl4AiTester):
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_css_selector(tester: Crawl4AiTester):
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
-        "crawler_params": {
-            "headless": True
-        },
-        "extra": {"word_count_threshold": 10}
-        
+        "crawler_params": {"headless": True},
+        "extra": {"word_count_threshold": 10},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_structured_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Structured Extraction ===")
    schema = {
@@ -133,21 +134,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
-            }
+            },
        ],
    }
-    
+
    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
-        "extraction_config": {
-            "type": "json_css",
-            "params": {
-                "schema": schema
-            }
-        }
+        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
    }
-    
+
    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
@@ -155,6 +151,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
    assert result["result"]["success"]
    assert len(extracted) > 0

+
 def test_llm_extraction(tester: Crawl4AiTester):
    print("\n=== Testing LLM Extraction ===")
    schema = {
@@ -162,20 +159,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
        "properties": {
            "model_name": {
                "type": "string",
-                "description": "Name of the OpenAI model."
+                "description": "Name of the OpenAI model.",
            },
            "input_fee": {
                "type": "string",
-                "description": "Fee for input token for the OpenAI model."
+                "description": "Fee for input token for the OpenAI model.",
            },
            "output_fee": {
                "type": "string",
-                "description": "Fee for output token for the OpenAI model."
-            }
+                "description": "Fee for output token for the OpenAI model.",
+            },
        },
-        "required": ["model_name", "input_fee", "output_fee"]
+        "required": ["model_name", "input_fee", "output_fee"],
    }
-    
+
    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
@@ -186,12 +183,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
-            }
+                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+            },
        },
-        "crawler_params": {"word_count_threshold": 1}
+        "crawler_params": {"word_count_threshold": 1},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -201,6 +198,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")

+
 def test_llm_with_ollama(tester: Crawl4AiTester):
    print("\n=== Testing LLM with Ollama ===")
    schema = {
@@ -208,20 +206,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
        "properties": {
            "article_title": {
                "type": "string",
-                "description": "The main title of the news article"
+                "description": "The main title of the news article",
            },
            "summary": {
                "type": "string",
-                "description": "A brief summary of the article content"
+                "description": "A brief summary of the article content",
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
-                "description": "Main topics or themes discussed in the article"
-            }
-        }
+                "description": "Main topics or themes discussed in the article",
+            },
+        },
    }
-    
+
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
@@ -231,13 +229,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": "Extract the main article information including title, summary, and main topics."
-            }
+                "instruction": "Extract the main article information including title, summary, and main topics.",
+            },
        },
        "extra": {"word_count_threshold": 1},
-        "crawler_params": {"verbose": True}
+        "crawler_params": {"verbose": True},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -246,6 +244,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")

+
 def test_cosine_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Cosine Extraction ===")
    request = {
@@ -257,11 +256,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
-                "top_k": 3
-            }
-        }
+                "top_k": 3,
+            },
+        },
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -271,30 +270,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")

+
 def test_screenshot(tester: Crawl4AiTester):
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))
-    
+
    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")
-    
+
    assert result["result"]["success"]

+
 if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
-    test_docker_deployment(version)
+    test_docker_deployment(version)
--- a/tests/test_llmtxt.py
+++ b/tests/test_llmtxt.py
@@ -3,20 +3,21 @@ from crawl4ai.async_logger import AsyncLogger
 from pathlib import Path
 import asyncio

+
 async def main():
    current_file = Path(__file__).resolve()
    # base_dir = current_file.parent.parent / "local/_docs/llm.txt/test_docs"
    base_dir = current_file.parent.parent / "local/_docs/llm.txt"
    docs_dir = base_dir
-    
+
    # Create directory if it doesn't exist
    docs_dir.mkdir(parents=True, exist_ok=True)
-   
+
    # Initialize logger
    logger = AsyncLogger()
    # Updated initialization with default batching params
    # manager = AsyncLLMTextManager(docs_dir, logger, max_concurrent_calls=3, batch_size=2)
-    manager = AsyncLLMTextManager(docs_dir, logger,  batch_size=2)
+    manager = AsyncLLMTextManager(docs_dir, logger, batch_size=2)

    # Let's first check what files we have
    print("\nAvailable files:")
@@ -26,8 +27,7 @@ async def main():
    # Generate index files
    print("\nGenerating index files...")
    await manager.generate_index_files(
-        force_generate_facts=False,
-        clear_bm25_cache=False
+        force_generate_facts=False, clear_bm25_cache=False
    )

    # Test some relevant queries about Crawl4AI
@@ -41,9 +41,12 @@ async def main():
        results = manager.search(query, top_k=2)
        print(f"Results length: {len(results)} characters")
        if results:
-            print("First 200 chars of results:", results[:200].replace('\n', ' '), "...")
+            print(
+                "First 200 chars of results:", results[:200].replace("\n", " "), "..."
+            )
        else:
            print("No results found")

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -3,8 +3,8 @@ import aiohttp
 import json
 import time
 import os
-from typing import Optional, Dict, Any
-from pydantic import BaseModel, HttpUrl
+from typing import Dict, Any
+

 class NBCNewsAPITest:
    def __init__(self, base_url: str = "http://localhost:8000"):
@@ -20,7 +20,9 @@ class NBCNewsAPITest:
            await self.session.close()

    async def submit_crawl(self, request_data: Dict[str, Any]) -> str:
-        async with self.session.post(f"{self.base_url}/crawl", json=request_data) as response:
+        async with self.session.post(
+            f"{self.base_url}/crawl", json=request_data
+        ) as response:
            result = await response.json()
            return result["task_id"]

@@ -28,11 +30,15 @@ class NBCNewsAPITest:
        async with self.session.get(f"{self.base_url}/task/{task_id}") as response:
            return await response.json()

-    async def wait_for_task(self, task_id: str, timeout: int = 300, poll_interval: int = 2) -> Dict[str, Any]:
+    async def wait_for_task(
+        self, task_id: str, timeout: int = 300, poll_interval: int = 2
+    ) -> Dict[str, Any]:
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )

            status = await self.get_task_status(task_id)
            if status["status"] in ["completed", "failed"]:
@@ -44,13 +50,11 @@ class NBCNewsAPITest:
        async with self.session.get(f"{self.base_url}/health") as response:
            return await response.json()

+
 async def test_basic_crawl():
    print("\n=== Testing Basic Crawl ===")
    async with NBCNewsAPITest() as api:
-        request = {
-            "urls": "https://www.nbcnews.com/business",
-            "priority": 10
-        }
+        request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
        print(f"Basic crawl result length: {len(result['result']['markdown'])}")
@@ -58,6 +62,7 @@ async def test_basic_crawl():
        assert "result" in result
        assert result["result"]["success"]

+
 async def test_js_execution():
    print("\n=== Testing JS Execution ===")
    async with NBCNewsAPITest() as api:
@@ -68,9 +73,7 @@ async def test_js_execution():
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            "wait_for": "article.tease-card:nth-child(10)",
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -78,13 +81,14 @@ async def test_js_execution():
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_css_selector():
    print("\n=== Testing CSS Selector ===")
    async with NBCNewsAPITest() as api:
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 7,
-            "css_selector": ".wide-tease-item__description"
+            "css_selector": ".wide-tease-item__description",
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -92,6 +96,7 @@ async def test_css_selector():
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_structured_extraction():
    print("\n=== Testing Structured Extraction ===")
    async with NBCNewsAPITest() as api:
@@ -99,34 +104,25 @@ async def test_structured_extraction():
            "name": "NBC News Articles",
            "baseSelector": "article.tease-card",
            "fields": [
-                {
-                    "name": "title",
-                    "selector": "h2",
-                    "type": "text"
-                },
+                {"name": "title", "selector": "h2", "type": "text"},
                {
                    "name": "description",
                    "selector": ".tease-card__description",
-                    "type": "text"
+                    "type": "text",
                },
                {
                    "name": "link",
                    "selector": "a",
                    "type": "attribute",
-                    "attribute": "href"
-                }
-            ]
+                    "attribute": "href",
+                },
+            ],
        }
-        
+
        request = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 9,
-            "extraction_config": {
-                "type": "json_css",
-                "params": {
-                    "schema": schema
-                }
-            }
+            "extraction_config": {"type": "json_css", "params": {"schema": schema}},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -136,6 +132,7 @@ async def test_structured_extraction():
        assert result["result"]["success"]
        assert len(extracted) > 0

+
 async def test_batch_crawl():
    print("\n=== Testing Batch Crawl ===")
    async with NBCNewsAPITest() as api:
@@ -143,12 +140,10 @@ async def test_batch_crawl():
            "urls": [
                "https://www.nbcnews.com/business",
                "https://www.nbcnews.com/business/consumer",
-                "https://www.nbcnews.com/business/economy"
+                "https://www.nbcnews.com/business/economy",
            ],
            "priority": 6,
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -157,6 +152,7 @@ async def test_batch_crawl():
        assert "results" in result
        assert len(result["results"]) == 3

+
 async def test_llm_extraction():
    print("\n=== Testing LLM Extraction with Ollama ===")
    async with NBCNewsAPITest() as api:
@@ -165,19 +161,19 @@ async def test_llm_extraction():
            "properties": {
                "article_title": {
                    "type": "string",
-                    "description": "The main title of the news article"
+                    "description": "The main title of the news article",
                },
                "summary": {
                    "type": "string",
-                    "description": "A brief summary of the article content"
+                    "description": "A brief summary of the article content",
                },
                "main_topics": {
                    "type": "array",
                    "items": {"type": "string"},
-                    "description": "Main topics or themes discussed in the article"
-                }
+                    "description": "Main topics or themes discussed in the article",
+                },
            },
-            "required": ["article_title", "summary", "main_topics"]
+            "required": ["article_title", "summary", "main_topics"],
        }

        request = {
@@ -191,26 +187,24 @@ async def test_llm_extraction():
                    "schema": schema,
                    "extraction_type": "schema",
                    "instruction": """Extract the main article information including title, a brief summary, and main topics discussed. 
-                    Focus on the primary business news article on the page."""
-                }
+                    Focus on the primary business news article on the page.""",
+                },
            },
-            "crawler_params": {
-                "headless": True,
-                "word_count_threshold": 1
-            }
+            "crawler_params": {"headless": True, "word_count_threshold": 1},
        }
-        
+
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
-        
+
        if result["status"] == "completed":
            extracted = json.loads(result["result"]["extracted_content"])
-            print(f"Extracted article analysis:")
+            print("Extracted article analysis:")
            print(json.dumps(extracted, indent=2))
-        
+
        assert result["status"] == "completed"
        assert result["result"]["success"]

+
 async def test_screenshot():
    print("\n=== Testing Screenshot ===")
    async with NBCNewsAPITest() as api:
@@ -218,9 +212,7 @@ async def test_screenshot():
            "urls": "https://www.nbcnews.com/business",
            "priority": 5,
            "screenshot": True,
-            "crawler_params": {
-                "headless": True
-            }
+            "crawler_params": {"headless": True},
        }
        task_id = await api.submit_crawl(request)
        result = await api.wait_for_task(task_id)
@@ -229,6 +221,7 @@ async def test_screenshot():
        assert result["result"]["success"]
        assert result["result"]["screenshot"] is not None

+
 async def test_priority_handling():
    print("\n=== Testing Priority Handling ===")
    async with NBCNewsAPITest() as api:
@@ -236,7 +229,7 @@ async def test_priority_handling():
        low_priority = {
            "urls": "https://www.nbcnews.com/business",
            "priority": 1,
-            "crawler_params": {"headless": True}
+            "crawler_params": {"headless": True},
        }
        low_task_id = await api.submit_crawl(low_priority)

@@ -244,7 +237,7 @@ async def test_priority_handling():
        high_priority = {
            "urls": "https://www.nbcnews.com/business/consumer",
            "priority": 10,
-            "crawler_params": {"headless": True}
+            "crawler_params": {"headless": True},
        }
        high_task_id = await api.submit_crawl(high_priority)

@@ -256,6 +249,7 @@ async def test_priority_handling():
        assert high_result["status"] == "completed"
        assert low_result["status"] == "completed"

+
 async def main():
    try:
        # Start with health check
@@ -277,5 +271,6 @@ async def main():
        print(f"Test failed: {str(e)}")
        raise

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/tests/test_scraping_strategy.py
+++ b/tests/test_scraping_strategy.py
@@ -0,0 +1,26 @@
+import nest_asyncio
+
+nest_asyncio.apply()
+
+import asyncio
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    LXMLWebScrapingStrategy,
+    CacheMode,
+)
+
+
+async def main():
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy(),  # Faster alternative to default BeautifulSoup
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        print(f"Success: {result.success}")
+        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/tests/test_web_crawler.py
+++ b/tests/test_web_crawler.py
@@ -1,79 +1,105 @@
 import unittest, os
 from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
-from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
+from crawl4ai.chunking_strategy import (
+    RegexChunking,
+    FixedLengthWordChunking,
+    SlidingWindowChunking,
+)
+from crawl4ai.extraction_strategy import (
+    CosineStrategy,
+    LLMExtractionStrategy,
+    TopicExtractionStrategy,
+    NoExtractionStrategy,
+)
+

 class TestWebCrawler(unittest.TestCase):
-    
    def setUp(self):
        self.crawler = WebCrawler()
-    
+
    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")
-    
+
    def test_run_default_strategies(self):
        result = self.crawler.run(
-            url='https://www.nbcnews.com/business',
+            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
-            extraction_strategy=CosineStrategy(), bypass_cache=True
+            extraction_strategy=CosineStrategy(),
+            bypass_cache=True,
        )
-        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")
-    
+        self.assertTrue(
+            result.success, "Failed to crawl and extract using default strategies"
+        )
+
    def test_run_different_strategies(self):
-        url = 'https://www.nbcnews.com/business'
-        
+        url = "https://www.nbcnews.com/business"
+
        # Test with FixedLengthWordChunking and LLMExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
-            extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
+            extraction_strategy=LLMExtractionStrategy(
+                provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY")
+            ),
+            bypass_cache=True,
        )
-        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")
-        
+        self.assertTrue(
+            result.success,
+            "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy",
+        )
+
        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
-            extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
+            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
+            bypass_cache=True,
        )
-        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")
-    
+        self.assertTrue(
+            result.success,
+            "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy",
+        )
+
    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
-            self.crawler.run(url='invalid_url', bypass_cache=True)
+            self.crawler.run(url="invalid_url", bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))
-    
+
    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
-            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
+            self.crawler.run(
+                url="https://www.nbcnews.com/business",
+                extraction_strategy="UnsupportedStrategy",
+                bypass_cache=True,
+            )
        self.assertIn("Unsupported extraction strategy", str(context.exception))
-    
+
    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
-            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
+            self.crawler.run(
+                url="https://www.nbcnews.com/business",
+                css_selector="invalid_selector",
+                bypass_cache=True,
+            )
        self.assertIn("Invalid CSS selector", str(context.exception))

-    
    def test_crawl_with_cache_and_bypass_cache(self):
-        url = 'https://www.nbcnews.com/business'
-        
+        url = "https://www.nbcnews.com/business"
+
        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")
-        
+
        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")
-    
+
    def test_fetch_multiple_pages(self):
-        urls = [
-            'https://www.nbcnews.com/business',
-            'https://www.bbc.com/news'
-        ]
+        urls = ["https://www.nbcnews.com/business", "https://www.bbc.com/news"]
        results = []
        for url in urls:
            result = self.crawler.run(
@@ -81,31 +107,42 @@ class TestWebCrawler(unittest.TestCase):
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
-                bypass_cache=True
+                bypass_cache=True,
            )
            results.append(result)
-        
+
        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
-            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")
-    
+            self.assertTrue(
+                result.success, "Failed to crawl and extract a page in the list"
+            )
+
    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
-            url='https://www.nbcnews.com/business',
+            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
-            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
+            extraction_strategy=NoExtractionStrategy(),
+            bypass_cache=True,
+        )
+        self.assertTrue(
+            result.success,
+            "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy",
        )
-        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
-            url='https://www.nbcnews.com/business',
+            url="https://www.nbcnews.com/business",
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
-            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
+            extraction_strategy=NoExtractionStrategy(),
+            bypass_cache=True,
+        )
+        self.assertTrue(
+            result.success,
+            "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy",
        )
-        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    unittest.main()