Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/amazon_product_extraction_direct_url.py
+++ b/docs/examples/amazon_product_extraction_direct_url.py
@@ -9,13 +9,11 @@ from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json

+
 async def extract_amazon_products():
    # Initialize browser config
-    browser_config = BrowserConfig(
-        browser_type="chromium",
-        headless=True
-    )
-    
+    browser_config = BrowserConfig(browser_type="chromium", headless=True)
+
    # Initialize crawler config with JSON CSS extraction strategy
    crawler_config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(
@@ -27,74 +25,70 @@ async def extract_amazon_products():
                        "name": "asin",
                        "selector": "",
                        "type": "attribute",
-                        "attribute": "data-asin"
-                    },
-                    {
-                        "name": "title",
-                        "selector": "h2 a span",
-                        "type": "text"
+                        "attribute": "data-asin",
                    },
+                    {"name": "title", "selector": "h2 a span", "type": "text"},
                    {
                        "name": "url",
                        "selector": "h2 a",
                        "type": "attribute",
-                        "attribute": "href"
+                        "attribute": "href",
                    },
                    {
                        "name": "image",
                        "selector": ".s-image",
                        "type": "attribute",
-                        "attribute": "src"
+                        "attribute": "src",
                    },
                    {
                        "name": "rating",
                        "selector": ".a-icon-star-small .a-icon-alt",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "reviews_count",
                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": ".a-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "original_price",
                        "selector": ".a-price.a-text-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "sponsored",
                        "selector": ".puis-sponsored-label-text",
-                        "type": "exists"
+                        "type": "exists",
                    },
                    {
                        "name": "delivery_info",
                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
                        "type": "text",
-                        "multiple": True
-                    }
-                ]
+                        "multiple": True,
+                    },
+                ],
            }
        )
    )

    # Example search URL (you should replace with your actual Amazon URL)
    url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
-    
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract the data
        result = await crawler.arun(url=url, config=crawler_config)
-        
+
        # Process and print the results
        if result and result.extracted_content:
            # Parse the JSON string into a list of products
            products = json.loads(result.extracted_content)
-            
+
            # Process each product in the list
            for product in products:
                print("\nProduct Details:")
@@ -105,10 +99,12 @@ async def extract_amazon_products():
                print(f"Rating: {product.get('rating')}")
                print(f"Reviews: {product.get('reviews_count')}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)

+
 if __name__ == "__main__":
    import asyncio
+
    asyncio.run(extract_amazon_products())
--- a/docs/examples/amazon_product_extraction_using_hooks.py
+++ b/docs/examples/amazon_product_extraction_using_hooks.py
@@ -10,17 +10,17 @@ from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json
 from playwright.async_api import Page, BrowserContext

+
 async def extract_amazon_products():
    # Initialize browser config
    browser_config = BrowserConfig(
        # browser_type="chromium",
        headless=True
    )
-    
+
    # Initialize crawler config with JSON CSS extraction strategy
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
                "name": "Amazon Product Search Results",
@@ -30,102 +30,105 @@ async def extract_amazon_products():
                        "name": "asin",
                        "selector": "",
                        "type": "attribute",
-                        "attribute": "data-asin"
-                    },
-                    {
-                        "name": "title",
-                        "selector": "h2 a span",
-                        "type": "text"
+                        "attribute": "data-asin",
                    },
+                    {"name": "title", "selector": "h2 a span", "type": "text"},
                    {
                        "name": "url",
                        "selector": "h2 a",
                        "type": "attribute",
-                        "attribute": "href"
+                        "attribute": "href",
                    },
                    {
                        "name": "image",
                        "selector": ".s-image",
                        "type": "attribute",
-                        "attribute": "src"
+                        "attribute": "src",
                    },
                    {
                        "name": "rating",
                        "selector": ".a-icon-star-small .a-icon-alt",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "reviews_count",
                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": ".a-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "original_price",
                        "selector": ".a-price.a-text-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "sponsored",
                        "selector": ".puis-sponsored-label-text",
-                        "type": "exists"
+                        "type": "exists",
                    },
                    {
                        "name": "delivery_info",
                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
                        "type": "text",
-                        "multiple": True
-                    }
-                ]
+                        "multiple": True,
+                    },
+                ],
            }
-        )
+        ),
    )

    url = "https://www.amazon.com/"
-    
-    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
-        
+
        try:
            # Wait for search box to be available
-            search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
-            
+            search_box = await page.wait_for_selector(
+                "#twotabsearchtextbox", timeout=1000
+            )
+
            # Type the search query
-            await search_box.fill('Samsung Galaxy Tab')
-            
+            await search_box.fill("Samsung Galaxy Tab")
+
            # Get the search button and prepare for navigation
-            search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
-            
+            search_button = await page.wait_for_selector(
+                "#nav-search-submit-button", timeout=1000
+            )
+
            # Click with navigation waiting
            await search_button.click()
-            
+
            # Wait for search results to load
-            await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+            await page.wait_for_selector(
+                '[data-component-type="s-search-result"]', timeout=10000
+            )
            print("[HOOK] Search completed and results loaded!")
-            
+
        except Exception as e:
            print(f"[HOOK] Error during search operation: {str(e)}")
-            
-        return page    
-    
+
+        return page
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
-        
        crawler.crawler_strategy.set_hook("after_goto", after_goto)
-        
+
        # Extract the data
        result = await crawler.arun(url=url, config=crawler_config)
-        
+
        # Process and print the results
        if result and result.extracted_content:
            # Parse the JSON string into a list of products
            products = json.loads(result.extracted_content)
-            
+
            # Process each product in the list
            for product in products:
                print("\nProduct Details:")
@@ -136,10 +139,12 @@ async def extract_amazon_products():
                print(f"Rating: {product.get('rating')}")
                print(f"Reviews: {product.get('reviews_count')}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)

+
 if __name__ == "__main__":
    import asyncio
+
    asyncio.run(extract_amazon_products())
--- a/docs/examples/amazon_product_extraction_using_use_javascript.py
+++ b/docs/examples/amazon_product_extraction_using_use_javascript.py
@@ -8,7 +8,7 @@ from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
 import json
-from playwright.async_api import Page, BrowserContext
+

 async def extract_amazon_products():
    # Initialize browser config
@@ -16,7 +16,7 @@ async def extract_amazon_products():
        # browser_type="chromium",
        headless=True
    )
-    
+
    js_code_to_search = """
        const task = async () => {
            document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
@@ -30,7 +30,7 @@ async def extract_amazon_products():
    """
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-        js_code = js_code_to_search,
+        js_code=js_code_to_search,
        wait_for='css:[data-component-type="s-search-result"]',
        extraction_strategy=JsonCssExtractionStrategy(
            schema={
@@ -41,75 +41,70 @@ async def extract_amazon_products():
                        "name": "asin",
                        "selector": "",
                        "type": "attribute",
-                        "attribute": "data-asin"
-                    },
-                    {
-                        "name": "title",
-                        "selector": "h2 a span",
-                        "type": "text"
+                        "attribute": "data-asin",
                    },
+                    {"name": "title", "selector": "h2 a span", "type": "text"},
                    {
                        "name": "url",
                        "selector": "h2 a",
                        "type": "attribute",
-                        "attribute": "href"
+                        "attribute": "href",
                    },
                    {
                        "name": "image",
                        "selector": ".s-image",
                        "type": "attribute",
-                        "attribute": "src"
+                        "attribute": "src",
                    },
                    {
                        "name": "rating",
                        "selector": ".a-icon-star-small .a-icon-alt",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "reviews_count",
                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "price",
                        "selector": ".a-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "original_price",
                        "selector": ".a-price.a-text-price .a-offscreen",
-                        "type": "text"
+                        "type": "text",
                    },
                    {
                        "name": "sponsored",
                        "selector": ".puis-sponsored-label-text",
-                        "type": "exists"
+                        "type": "exists",
                    },
                    {
                        "name": "delivery_info",
                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
                        "type": "text",
-                        "multiple": True
-                    }
-                ]
+                        "multiple": True,
+                    },
+                ],
            }
-        )
+        ),
    )

    # Example search URL (you should replace with your actual Amazon URL)
    url = "https://www.amazon.com/"
- 
-    
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract the data
        result = await crawler.arun(url=url, config=crawler_config)
-        
+
        # Process and print the results
        if result and result.extracted_content:
            # Parse the JSON string into a list of products
            products = json.loads(result.extracted_content)
-            
+
            # Process each product in the list
            for product in products:
                print("\nProduct Details:")
@@ -120,10 +115,12 @@ async def extract_amazon_products():
                print(f"Rating: {product.get('rating')}")
                print(f"Reviews: {product.get('reviews_count')}")
                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
-                if product.get('delivery_info'):
+                if product.get("delivery_info"):
                    print(f"Delivery: {' '.join(product['delivery_info'])}")
                print("-" * 80)

+
 if __name__ == "__main__":
    import asyncio
+
    asyncio.run(extract_amazon_products())
--- a/docs/examples/async_webcrawler_multiple_urls_example.py
+++ b/docs/examples/async_webcrawler_multiple_urls_example.py
@@ -1,12 +1,16 @@
 # File: async_webcrawler_multiple_urls_example.py
 import os, sys
+
 # append 2 parent directories to sys.path to import crawl4ai
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 sys.path.append(parent_dir)

 import asyncio
 from crawl4ai import AsyncWebCrawler

+
 async def main():
    # Initialize the AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -16,7 +20,7 @@ async def main():
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
-            "https://news.ycombinator.com"
+            "https://news.ycombinator.com",
        ]

        # Set up crawling parameters
@@ -27,7 +31,7 @@ async def main():
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
-            verbose=True
+            verbose=True,
        )

        # Process the results
@@ -36,7 +40,9 @@ async def main():
                print(f"Successfully crawled: {result.url}")
                print(f"Title: {result.metadata.get('title', 'N/A')}")
                print(f"Word count: {len(result.markdown.split())}")
-                print(f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}")
+                print(
+                    f"Number of links: {len(result.links.get('internal', [])) + len(result.links.get('external', []))}"
+                )
                print(f"Number of images: {len(result.media.get('images', []))}")
                print("---")
            else:
@@ -44,5 +50,6 @@ async def main():
                print(f"Error: {result.error_message}")
                print("---")

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/browser_optimization_example.py
+++ b/docs/examples/browser_optimization_example.py
@@ -6,10 +6,8 @@ This example demonstrates optimal browser usage patterns in Crawl4AI:
 """

 import asyncio
-import os
 from typing import List
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
-from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator


--- a/docs/examples/crawlai_vs_firecrawl.py
+++ b/docs/examples/crawlai_vs_firecrawl.py
@@ -1,31 +1,32 @@
 import os, time
+
 # append the path to the root of the project
 import sys
 import asyncio
-sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
 from firecrawl import FirecrawlApp
 from crawl4ai import AsyncWebCrawler
-__data__ = os.path.join(os.path.dirname(__file__), '..', '..') + '/.data'
+
+__data__ = os.path.join(os.path.dirname(__file__), "..", "..") + "/.data"
+

 async def compare():
-    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

    # Test Firecrawl with a simple crawl
    start = time.time()
    scrape_status = app.scrape_url(
-    'https://www.nbcnews.com/business',
-    params={'formats': ['markdown', 'html']}
+        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print(f"Time taken: {end - start} seconds")
-    print(len(scrape_status['markdown']))
+    print(len(scrape_status["markdown"]))
    # save the markdown content with provider name
    with open(f"{__data__}/firecrawl_simple.md", "w") as f:
-        f.write(scrape_status['markdown'])
+        f.write(scrape_status["markdown"])
    # Count how many "cldnry.s-nbcnews.com" are in the markdown
-    print(scrape_status['markdown'].count("cldnry.s-nbcnews.com"))
-    
-
+    print(scrape_status["markdown"].count("cldnry.s-nbcnews.com"))

    async with AsyncWebCrawler() as crawler:
        start = time.time()
@@ -33,13 +34,13 @@ async def compare():
            url="https://www.nbcnews.com/business",
            # js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
            word_count_threshold=0,
-            bypass_cache=True, 
-            verbose=False
+            bypass_cache=True,
+            verbose=False,
        )
        end = time.time()
        print(f"Time taken: {end - start} seconds")
        print(len(result.markdown))
-        # save the markdown content with provider name  
+        # save the markdown content with provider name
        with open(f"{__data__}/crawl4ai_simple.md", "w") as f:
            f.write(result.markdown)
        # count how many "cldnry.s-nbcnews.com" are in the markdown
@@ -48,10 +49,12 @@ async def compare():
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
-            js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+            js_code=[
+                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
+            ],
            word_count_threshold=0,
-            bypass_cache=True, 
-            verbose=False
+            bypass_cache=True,
+            verbose=False,
        )
        end = time.time()
        print(f"Time taken: {end - start} seconds")
@@ -61,7 +64,7 @@ async def compare():
            f.write(result.markdown)
        # count how many "cldnry.s-nbcnews.com" are in the markdown
        print(result.markdown.count("cldnry.s-nbcnews.com"))
-        
+
+
 if __name__ == "__main__":
    asyncio.run(compare())
-    
--- a/docs/examples/dispatcher_example.py
+++ b/docs/examples/dispatcher_example.py
@@ -3,11 +3,18 @@ import time
 from rich import print
 from rich.table import Table
 from crawl4ai import (
-    AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, 
-    MemoryAdaptiveDispatcher, SemaphoreDispatcher,
-    RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    MemoryAdaptiveDispatcher,
+    SemaphoreDispatcher,
+    RateLimiter,
+    CrawlerMonitor,
+    DisplayMode,
+    CacheMode,
 )

+
 async def memory_adaptive(urls, browser_config, run_config):
    """Memory adaptive crawler with monitoring"""
    start = time.perf_counter()
@@ -16,14 +23,16 @@ async def memory_adaptive(urls, browser_config, run_config):
            memory_threshold_percent=70.0,
            max_session_permit=10,
            monitor=CrawlerMonitor(
-                max_visible_rows=15,
-                display_mode=DisplayMode.DETAILED
-            )
+                max_visible_rows=15, display_mode=DisplayMode.DETAILED
+            ),
+        )
+        results = await crawler.arun_many(
+            urls, config=run_config, dispatcher=dispatcher
        )
-        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
    duration = time.perf_counter() - start
    return len(results), duration

+
 async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
    """Memory adaptive crawler with rate limiting"""
    start = time.perf_counter()
@@ -32,19 +41,19 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
            memory_threshold_percent=70.0,
            max_session_permit=10,
            rate_limiter=RateLimiter(
-                base_delay=(1.0, 2.0),
-                max_delay=30.0,
-                max_retries=2
+                base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
            ),
            monitor=CrawlerMonitor(
-                max_visible_rows=15,
-                display_mode=DisplayMode.DETAILED
-            )
+                max_visible_rows=15, display_mode=DisplayMode.DETAILED
+            ),
+        )
+        results = await crawler.arun_many(
+            urls, config=run_config, dispatcher=dispatcher
        )
-        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
    duration = time.perf_counter() - start
    return len(results), duration

+
 async def semaphore(urls, browser_config, run_config):
    """Basic semaphore crawler"""
    start = time.perf_counter()
@@ -52,14 +61,16 @@ async def semaphore(urls, browser_config, run_config):
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            monitor=CrawlerMonitor(
-                max_visible_rows=15,
-                display_mode=DisplayMode.DETAILED
-            )
+                max_visible_rows=15, display_mode=DisplayMode.DETAILED
+            ),
+        )
+        results = await crawler.arun_many(
+            urls, config=run_config, dispatcher=dispatcher
        )
-        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
    duration = time.perf_counter() - start
    return len(results), duration

+
 async def semaphore_with_rate_limit(urls, browser_config, run_config):
    """Semaphore crawler with rate limiting"""
    start = time.perf_counter()
@@ -67,19 +78,19 @@ async def semaphore_with_rate_limit(urls, browser_config, run_config):
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            rate_limiter=RateLimiter(
-                base_delay=(1.0, 2.0),
-                max_delay=30.0,
-                max_retries=2
+                base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
            ),
            monitor=CrawlerMonitor(
-                max_visible_rows=15,
-                display_mode=DisplayMode.DETAILED
-            )
+                max_visible_rows=15, display_mode=DisplayMode.DETAILED
+            ),
+        )
+        results = await crawler.arun_many(
+            urls, config=run_config, dispatcher=dispatcher
        )
-        results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
    duration = time.perf_counter() - start
    return len(results), duration

+
 def create_performance_table(results):
    """Creates a rich table showing performance results"""
    table = Table(title="Crawler Strategy Performance Comparison")
@@ -89,18 +100,16 @@ def create_performance_table(results):
    table.add_column("URLs/second", justify="right", style="magenta")

    sorted_results = sorted(results.items(), key=lambda x: x[1][1])
-    
+
    for strategy, (urls_crawled, duration) in sorted_results:
        urls_per_second = urls_crawled / duration
        table.add_row(
-            strategy,
-            str(urls_crawled),
-            f"{duration:.2f}",
-            f"{urls_per_second:.2f}"
+            strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
        )
-    
+
    return table

+
 async def main():
    urls = [f"https://example.com/page{i}" for i in range(1, 20)]
    browser_config = BrowserConfig(headless=True, verbose=False)
@@ -108,14 +117,19 @@ async def main():

    results = {
        "Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
-        "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
+        "Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
+            urls, browser_config, run_config
+        ),
        "Semaphore": await semaphore(urls, browser_config, run_config),
-        "Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
+        "Semaphore + Rate Limit": await semaphore_with_rate_limit(
+            urls, browser_config, run_config
+        ),
    }

    table = create_performance_table(results)
    print("\nPerformance Summary:")
    print(table)

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -6,63 +6,80 @@ import base64
 import os
 from typing import Dict, Any

+
 class Crawl4AiTester:
    def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
        self.base_url = base_url
-        self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"  # Check environment variable as fallback
-        self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
-        
-    def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
+        self.api_token = (
+            api_token or os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+        )  # Check environment variable as fallback
+        self.headers = (
+            {"Authorization": f"Bearer {self.api_token}"} if self.api_token else {}
+        )
+
+    def submit_and_wait(
+        self, request_data: Dict[str, Any], timeout: int = 300
+    ) -> Dict[str, Any]:
        # Submit crawl job
-        response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
+        response = requests.post(
+            f"{self.base_url}/crawl", json=request_data, headers=self.headers
+        )
        if response.status_code == 403:
            raise Exception("API token is invalid or missing")
        task_id = response.json()["task_id"]
        print(f"Task ID: {task_id}")
-        
+
        # Poll for result
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
-                raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
-                
-            result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
+                raise TimeoutError(
+                    f"Task {task_id} did not complete within {timeout} seconds"
+                )
+
+            result = requests.get(
+                f"{self.base_url}/task/{task_id}", headers=self.headers
+            )
            status = result.json()
-            
+
            if status["status"] == "failed":
                print("Task failed:", status.get("error"))
                raise Exception(f"Task failed: {status.get('error')}")
-                
+
            if status["status"] == "completed":
                return status
-                
+
            time.sleep(2)
-            
+
    def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
+        response = requests.post(
+            f"{self.base_url}/crawl_sync",
+            json=request_data,
+            headers=self.headers,
+            timeout=60,
+        )
        if response.status_code == 408:
            raise TimeoutError("Task did not complete within server timeout")
        response.raise_for_status()
        return response.json()
-    
+
    def crawl_direct(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
        """Directly crawl without using task queue"""
        response = requests.post(
-            f"{self.base_url}/crawl_direct", 
-            json=request_data, 
-            headers=self.headers
+            f"{self.base_url}/crawl_direct", json=request_data, headers=self.headers
        )
        response.raise_for_status()
        return response.json()

+
 def test_docker_deployment(version="basic"):
    tester = Crawl4AiTester(
-        base_url="http://localhost:11235" ,
+        base_url="http://localhost:11235",
        # base_url="https://api.crawl4ai.com" # just for example
        # api_token="test" # just for example
    )
    print(f"Testing Crawl4AI Docker {version} version")
-    
+
    # Health check with timeout and retry
    max_retries = 5
    for i in range(max_retries):
@@ -70,19 +87,19 @@ def test_docker_deployment(version="basic"):
            health = requests.get(f"{tester.base_url}/health", timeout=10)
            print("Health check:", health.json())
            break
-        except requests.exceptions.RequestException as e:
+        except requests.exceptions.RequestException:
            if i == max_retries - 1:
                print(f"Failed to connect after {max_retries} attempts")
                sys.exit(1)
            print(f"Waiting for service to start (attempt {i+1}/{max_retries})...")
            time.sleep(5)
-    
+
    # Test cases based on version
    test_basic_crawl_direct(tester)
    test_basic_crawl(tester)
    test_basic_crawl(tester)
    test_basic_crawl_sync(tester)
-    
+
    if version in ["full", "transformer"]:
        test_cosine_extraction(tester)

@@ -92,49 +109,52 @@ def test_docker_deployment(version="basic"):
    test_llm_extraction(tester)
    test_llm_with_ollama(tester)
    test_screenshot(tester)
-    
+

 def test_basic_crawl(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
-        "priority": 10, 
-        "session_id": "test"
+        "priority": 10,
+        "session_id": "test",
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]
    assert len(result["result"]["markdown"]) > 0

+
 def test_basic_crawl_sync(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl (Sync) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
-        "session_id": "test"
+        "session_id": "test",
    }
-    
+
    result = tester.submit_sync(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['status'] == 'completed'
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
-    
+    assert result["status"] == "completed"
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
+
+
 def test_basic_crawl_direct(tester: Crawl4AiTester):
    print("\n=== Testing Basic Crawl (Direct) ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 10,
        # "session_id": "test"
-        "cache_mode": "bypass"  # or "enabled", "disabled", "read_only", "write_only"
+        "cache_mode": "bypass",  # or "enabled", "disabled", "read_only", "write_only"
    }
-    
+
    result = tester.crawl_direct(request)
    print(f"Basic crawl result length: {len(result['result']['markdown'])}")
-    assert result['result']['success']
-    assert len(result['result']['markdown']) > 0
-    
+    assert result["result"]["success"]
+    assert len(result["result"]["markdown"]) > 0
+
+
 def test_js_execution(tester: Crawl4AiTester):
    print("\n=== Testing JS Execution ===")
    request = {
@@ -144,32 +164,29 @@ def test_js_execution(tester: Crawl4AiTester):
            "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        ],
        "wait_for": "article.tease-card:nth-child(10)",
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"JS execution result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_css_selector(tester: Crawl4AiTester):
    print("\n=== Testing CSS Selector ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 7,
        "css_selector": ".wide-tease-item__description",
-        "crawler_params": {
-            "headless": True
-        },
-        "extra": {"word_count_threshold": 10}
-        
+        "crawler_params": {"headless": True},
+        "extra": {"word_count_threshold": 10},
    }
-    
+
    result = tester.submit_and_wait(request)
    print(f"CSS selector result length: {len(result['result']['markdown'])}")
    assert result["result"]["success"]

+
 def test_structured_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Structured Extraction ===")
    schema = {
@@ -190,21 +207,16 @@ def test_structured_extraction(tester: Crawl4AiTester):
                "name": "price",
                "selector": "td:nth-child(2)",
                "type": "text",
-            }
+            },
        ],
    }
-    
+
    request = {
        "urls": "https://www.coinbase.com/explore",
        "priority": 9,
-        "extraction_config": {
-            "type": "json_css",
-            "params": {
-                "schema": schema
-            }
-        }
+        "extraction_config": {"type": "json_css", "params": {"schema": schema}},
    }
-    
+
    result = tester.submit_and_wait(request)
    extracted = json.loads(result["result"]["extracted_content"])
    print(f"Extracted {len(extracted)} items")
@@ -212,6 +224,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
    assert result["result"]["success"]
    assert len(extracted) > 0

+
 def test_llm_extraction(tester: Crawl4AiTester):
    print("\n=== Testing LLM Extraction ===")
    schema = {
@@ -219,20 +232,20 @@ def test_llm_extraction(tester: Crawl4AiTester):
        "properties": {
            "model_name": {
                "type": "string",
-                "description": "Name of the OpenAI model."
+                "description": "Name of the OpenAI model.",
            },
            "input_fee": {
                "type": "string",
-                "description": "Fee for input token for the OpenAI model."
+                "description": "Fee for input token for the OpenAI model.",
            },
            "output_fee": {
                "type": "string",
-                "description": "Fee for output token for the OpenAI model."
-            }
+                "description": "Fee for output token for the OpenAI model.",
+            },
        },
-        "required": ["model_name", "input_fee", "output_fee"]
+        "required": ["model_name", "input_fee", "output_fee"],
    }
-    
+
    request = {
        "urls": "https://openai.com/api/pricing",
        "priority": 8,
@@ -243,12 +256,12 @@ def test_llm_extraction(tester: Crawl4AiTester):
                "api_token": os.getenv("OPENAI_API_KEY"),
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens."""
-            }
+                "instruction": """From the crawled content, extract all mentioned model names along with their fees for input and output tokens.""",
+            },
        },
-        "crawler_params": {"word_count_threshold": 1}
+        "crawler_params": {"word_count_threshold": 1},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -258,6 +271,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"LLM extraction test failed (might be due to missing API key): {str(e)}")

+
 def test_llm_with_ollama(tester: Crawl4AiTester):
    print("\n=== Testing LLM with Ollama ===")
    schema = {
@@ -265,20 +279,20 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
        "properties": {
            "article_title": {
                "type": "string",
-                "description": "The main title of the news article"
+                "description": "The main title of the news article",
            },
            "summary": {
                "type": "string",
-                "description": "A brief summary of the article content"
+                "description": "A brief summary of the article content",
            },
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
-                "description": "Main topics or themes discussed in the article"
-            }
-        }
+                "description": "Main topics or themes discussed in the article",
+            },
+        },
    }
-    
+
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 8,
@@ -288,13 +302,13 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
                "provider": "ollama/llama2",
                "schema": schema,
                "extraction_type": "schema",
-                "instruction": "Extract the main article information including title, summary, and main topics."
-            }
+                "instruction": "Extract the main article information including title, summary, and main topics.",
+            },
        },
        "extra": {"word_count_threshold": 1},
-        "crawler_params": {"verbose": True}
+        "crawler_params": {"verbose": True},
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -303,6 +317,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Ollama extraction test failed: {str(e)}")

+
 def test_cosine_extraction(tester: Crawl4AiTester):
    print("\n=== Testing Cosine Extraction ===")
    request = {
@@ -314,11 +329,11 @@ def test_cosine_extraction(tester: Crawl4AiTester):
                "semantic_filter": "business finance economy",
                "word_count_threshold": 10,
                "max_dist": 0.2,
-                "top_k": 3
-            }
-        }
+                "top_k": 3,
+            },
+        },
    }
-    
+
    try:
        result = tester.submit_and_wait(request)
        extracted = json.loads(result["result"]["extracted_content"])
@@ -328,30 +343,30 @@ def test_cosine_extraction(tester: Crawl4AiTester):
    except Exception as e:
        print(f"Cosine extraction test failed: {str(e)}")

+
 def test_screenshot(tester: Crawl4AiTester):
    print("\n=== Testing Screenshot ===")
    request = {
        "urls": "https://www.nbcnews.com/business",
        "priority": 5,
        "screenshot": True,
-        "crawler_params": {
-            "headless": True
-        }
+        "crawler_params": {"headless": True},
    }
-    
+
    result = tester.submit_and_wait(request)
    print("Screenshot captured:", bool(result["result"]["screenshot"]))
-    
+
    if result["result"]["screenshot"]:
        # Save screenshot
        screenshot_data = base64.b64decode(result["result"]["screenshot"])
        with open("test_screenshot.jpg", "wb") as f:
            f.write(screenshot_data)
        print("Screenshot saved as test_screenshot.jpg")
-    
+
    assert result["result"]["success"]

+
 if __name__ == "__main__":
    version = sys.argv[1] if len(sys.argv) > 1 else "basic"
    # version = "full"
-    test_docker_deployment(version)
+    test_docker_deployment(version)
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_example.py
@@ -9,18 +9,17 @@ This example shows how to:

 import asyncio
 import os
-from typing import Dict, Any

 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
-    JsonXPathExtractionStrategy
+    JsonXPathExtractionStrategy,
 )
-from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

+
 async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Helper function to run extraction with proper configuration"""
    try:
@@ -30,78 +29,90 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
-            )
+            ),
        )
-        
+
        # Run the crawler
        result = await crawler.arun(url=url, config=config)
-        
+
        if result.success:
            print(f"\n=== {name} Results ===")
            print(f"Extracted Content: {result.extracted_content}")
            print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
-            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+            print(
+                f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
+            )
        else:
            print(f"Error in {name}: Crawl failed")
-            
+
    except Exception as e:
        print(f"Error in {name}: {str(e)}")

+
 async def main():
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"
-    
+
    # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True,
-        verbose=True
-    )
-    
+    browser_config = BrowserConfig(headless=True, verbose=True)
+
    # Initialize extraction strategies
-    
+
    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information including name, price, and description"
+        instruction="Extract product information including name, price, and description",
    )
-    
+
    html_strategy = LLMExtractionStrategy(
        input_format="html",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information from HTML including structured data"
+        instruction="Extract product information from HTML including structured data",
    )
-    
+
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information from cleaned markdown"
+        instruction="Extract product information from cleaned markdown",
    )
-    
+
    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
-            {"name": "description", "selector": ".description", "type": "text"}
-        ]
+            {"name": "description", "selector": ".description", "type": "text"},
+        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
-    
+
    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
-            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
-            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
-            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
-        ]
+            {
+                "name": "title",
+                "selector": ".//h1[@class='product-title']/text()",
+                "type": "text",
+            },
+            {
+                "name": "price",
+                "selector": ".//span[@class='price']/text()",
+                "type": "text",
+            },
+            {
+                "name": "description",
+                "selector": ".//div[@class='description']/text()",
+                "type": "text",
+            },
+        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
-    
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
@@ -111,5 +122,6 @@ async def main():
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")

+
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -1,20 +1,23 @@
 import asyncio
 from crawl4ai import *

+
 async def main():
    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
-            )
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
+            ),
        )
        result = await crawler.arun(
-            url="https://www.helloworld.org",
-            config=crawler_config
+            url="https://www.helloworld.org", config=crawler_config
        )
        print(result.markdown_v2.raw_markdown[:500])

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/hooks_example.py
+++ b/docs/examples/hooks_example.py
@@ -1,19 +1,18 @@
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from playwright.async_api import Page, BrowserContext

+
 async def main():
    print("🔗 Hooks Example: Demonstrating different hook use cases")

    # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True
-    )
-    
+    browser_config = BrowserConfig(headless=True)
+
    # Configure crawler settings
    crawler_run_config = CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="body",
-        cache_mode=CacheMode.BYPASS
+        cache_mode=CacheMode.BYPASS,
    )

    # Create crawler instance
@@ -30,16 +29,22 @@ async def main():
        """Hook called after a new page and context are created"""
        print("[HOOK] on_page_context_created - New page created!")
        # Example: Set default viewport size
-        await context.add_cookies([{
-            'name': 'session_id',
-            'value': 'example_session',
-            'domain': '.example.com',
-            'path': '/'
-        }])
+        await context.add_cookies(
+            [
+                {
+                    "name": "session_id",
+                    "value": "example_session",
+                    "domain": ".example.com",
+                    "path": "/",
+                }
+            ]
+        )
        await page.set_viewport_size({"width": 1080, "height": 800})
        return page

-    async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
+    async def on_user_agent_updated(
+        page: Page, context: BrowserContext, user_agent: str, **kwargs
+    ):
        """Hook called when the user agent is updated"""
        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
        return page
@@ -53,17 +58,17 @@ async def main():
        """Hook called before navigating to each URL"""
        print(f"[HOOK] before_goto - About to visit: {url}")
        # Example: Add custom headers for the request
-        await page.set_extra_http_headers({
-            "Custom-Header": "my-value"
-        })
+        await page.set_extra_http_headers({"Custom-Header": "my-value"})
        return page

-    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+    async def after_goto(
+        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
+    ):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
        # Example: Wait for a specific element to be loaded
        try:
-            await page.wait_for_selector('.content', timeout=1000)
+            await page.wait_for_selector(".content", timeout=1000)
            print("Content element found!")
        except:
            print("Content element not found, continuing anyway")
@@ -76,7 +81,9 @@ async def main():
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

-    async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
+    async def before_return_html(
+        page: Page, context: BrowserContext, html: str, **kwargs
+    ):
        """Hook called before returning the HTML content"""
        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
        # Example: You could modify the HTML content here if needed
@@ -84,7 +91,9 @@ async def main():

    # Set all the hooks
    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
-    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+    crawler.crawler_strategy.set_hook(
+        "on_page_context_created", on_page_context_created
+    )
    crawler.crawler_strategy.set_hook("on_user_agent_updated", on_user_agent_updated)
    crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
    crawler.crawler_strategy.set_hook("before_goto", before_goto)
@@ -95,13 +104,15 @@ async def main():
    await crawler.start()

    # Example usage: crawl a simple website
-    url = 'https://example.com'
+    url = "https://example.com"
    result = await crawler.arun(url, config=crawler_run_config)
    print(f"\nCrawled URL: {result.url}")
    print(f"HTML length: {len(result.html)}")
-    
+
    await crawler.close()

+
 if __name__ == "__main__":
    import asyncio
-    asyncio.run(main())
+
+    asyncio.run(main())
--- a/docs/examples/language_support_example.py
+++ b/docs/examples/language_support_example.py
@@ -1,6 +1,7 @@
 import asyncio
 from crawl4ai import AsyncWebCrawler, AsyncPlaywrightCrawlerStrategy

+
 async def main():
    # Example 1: Setting language when creating the crawler
    crawler1 = AsyncWebCrawler(
@@ -9,11 +10,15 @@ async def main():
        )
    )
    result1 = await crawler1.arun("https://www.example.com")
-    print("Example 1 result:", result1.extracted_content[:100])  # Print first 100 characters
+    print(
+        "Example 1 result:", result1.extracted_content[:100]
+    )  # Print first 100 characters

    # Example 2: Setting language before crawling
    crawler2 = AsyncWebCrawler()
-    crawler2.crawler_strategy.headers["Accept-Language"] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
+    crawler2.crawler_strategy.headers[
+        "Accept-Language"
+    ] = "es-ES,es;q=0.9,en-US;q=0.8,en;q=0.7"
    result2 = await crawler2.arun("https://www.example.com")
    print("Example 2 result:", result2.extracted_content[:100])

@@ -21,7 +26,7 @@ async def main():
    crawler3 = AsyncWebCrawler()
    result3 = await crawler3.arun(
        "https://www.example.com",
-        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"}
+        headers={"Accept-Language": "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"},
    )
    print("Example 3 result:", result3.extracted_content[:100])

@@ -31,15 +36,15 @@ async def main():
        ("https://www.example.org", "es-ES,es;q=0.9"),
        ("https://www.example.net", "de-DE,de;q=0.9"),
    ]
-    
+
    crawler4 = AsyncWebCrawler()
-    results = await asyncio.gather(*[
-        crawler4.arun(url, headers={"Accept-Language": lang})
-        for url, lang in urls
-    ])
-    
+    results = await asyncio.gather(
+        *[crawler4.arun(url, headers={"Accept-Language": lang}) for url, lang in urls]
+    )
+
    for url, result in zip([u for u, _ in urls], results):
        print(f"Result for {url}:", result.extracted_content[:100])

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -3,32 +3,37 @@ from crawl4ai.crawler_strategy import *
 import asyncio
 from pydantic import BaseModel, Field

-url = r'https://openai.com/api/pricing/'
+url = r"https://openai.com/api/pricing/"
+

 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )
+

 from crawl4ai import AsyncWebCrawler

+
 async def main():
    # Use AsyncWebCrawler
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
-            extraction_strategy= LLMExtractionStrategy(
+            extraction_strategy=LLMExtractionStrategy(
                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
-                provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
+                provider="groq/llama-3.1-70b-versatile",
+                api_token=os.getenv("GROQ_API_KEY"),
                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
-                instruction="From the crawled content, extract all mentioned model names along with their " \
-                            "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
-                            'One extracted model JSON format should look like this: ' \
-                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+                instruction="From the crawled content, extract all mentioned model names along with their "
+                "fees for input and output tokens. Make sure not to miss anything in the entire content. "
+                "One extracted model JSON format should look like this: "
+                '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
            ),
-
        )
        print("Success:", result.success)
        model_fees = json.loads(result.extracted_content)
@@ -37,4 +42,5 @@ async def main():
        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)

+
 asyncio.run(main())
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -8,12 +8,12 @@ import asyncio
 import time
 import json
 import re
-from typing import Dict, List
+from typing import Dict
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
@@ -62,6 +62,7 @@ async def clean_content():
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")

+
 async def link_analysis():
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
@@ -76,9 +77,10 @@ async def link_analysis():
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")

-        for link in result.links['internal'][:5]:
+        for link in result.links["internal"][:5]:
            print(f"Href: {link['href']}\nText: {link['text']}\n")

+
 # JavaScript Execution Example
 async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
@@ -112,25 +114,29 @@ async def simple_example_with_css_selector():
        )
        print(result.markdown[:500])

+
 async def media_handling():
-    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
+            url="https://www.nbcnews.com/business", config=crawler_config
        )
-        for img in result.media['images'][:5]:
+        for img in result.media["images"][:5]:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")

+
 async def custom_hook_workflow(verbose=True):
    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
-        crawler.crawler_strategy.set_hook("before_goto", lambda page, context: print("[Hook] Preparing to navigate..."))
+        crawler.crawler_strategy.set_hook(
+            "before_goto",
+            lambda page, context: print("[Hook] Preparing to navigate..."),
+        )

        # Perform the crawl operation
-        result = await crawler.arun(
-            url="https://crawl4ai.com"
-        )
+        result = await crawler.arun(url="https://crawl4ai.com")
        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))


@@ -412,21 +418,22 @@ async def cosine_similarity_extraction():
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
-            max_dist=0.2, # Maximum distance between two words
-            linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
-            top_k=3, # Number of top keywords to extract
-            sim_threshold=0.3, # Similarity threshold for clustering
-            semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
-            verbose=True
-        ),        
+            max_dist=0.2,  # Maximum distance between two words
+            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
+            top_k=3,  # Number of top keywords to extract
+            sim_threshold=0.3,  # Similarity threshold for clustering
+            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
+            verbose=True,
+        ),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
-            config=crawl_config
+            config=crawl_config,
        )
        print(json.loads(result.extracted_content)[:5])

+
 # Browser Comparison
 async def crawl_custom_browser_type():
    print("\n--- Browser Comparison ---")
@@ -484,39 +491,42 @@ async def crawl_with_user_simulation():
        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)

+
 async def ssl_certification():
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
-        cache_mode=CacheMode.BYPASS  # Bypass cache to always get fresh certificates
+        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            url='https://example.com',
-            config=config
-        )
-        
+        result = await crawler.arun(url="https://example.com", config=config)
+
        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate
-            
+
            # 1. Access certificate properties directly
            print("\nCertificate Information:")
            print(f"Issuer: {cert.issuer.get('CN', '')}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")
-            
+
            # 2. Export certificate in different formats
            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
            print("\nCertificate exported to:")
            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
-            
-            pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
+
+            pem_data = cert.to_pem(
+                os.path.join(tmp_dir, "certificate.pem")
+            )  # For web servers
            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
-            
-            der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
+
+            der_data = cert.to_der(
+                os.path.join(tmp_dir, "certificate.der")
+            )  # For Java apps
            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")

+
 # Speed Comparison
 async def speed_comparison():
    print("\n--- Speed Comparison ---")
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,6 +1,10 @@
 import os, sys
+
 # append parent directory to system path
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))); os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692";
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"

 import asyncio
 # import nest_asyncio
@@ -15,7 +19,7 @@ from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
@@ -32,9 +36,12 @@ print("Website: https://crawl4ai.com")
 async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    # New code to handle the wait_for parameter
@@ -57,6 +64,7 @@ async def simple_example_with_running_js_code():
        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def simple_example_with_css_selector():
    print("\n--- Using CSS Selectors ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
@@ -67,42 +75,44 @@ async def simple_example_with_css_selector():
        )
        print(result.markdown[:500])  # Print first 500 characters

+
 async def use_proxy():
    print("\n--- Using a Proxy ---")
    print(
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )
    # Uncomment and modify the following lines to use a proxy
-    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
+    async with AsyncWebCrawler(
+        verbose=True, proxy="http://your-proxy-url:port"
+    ) as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            cache_mode= CacheMode.BYPASS
+            url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
        )
        if result.success:
            print(result.markdown[:500])  # Print first 500 characters

+
 async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url=url,
-            screenshot=True,
-            cache_mode= CacheMode.BYPASS
+            url=url, screenshot=True, cache_mode=CacheMode.BYPASS
        )
-        
+
        if result.success and result.screenshot:
            import base64
-            
+
            # Decode the base64 screenshot data
            screenshot_data = base64.b64decode(result.screenshot)
-            
+
            # Save the screenshot as a JPEG file
-            with open(output_path, 'wb') as f:
+            with open(output_path, "wb") as f:
                f.write(screenshot_data)
-            
+
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")

+
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
@@ -110,16 +120,19 @@ class OpenAIModelFee(BaseModel):
        ..., description="Fee for output token for the OpenAI model."
    )

-async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
    print(f"\n--- Extracting Structured Data with {provider} ---")
-    
+
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # extra_args = {}
-    extra_args={
-        "temperature": 0, 
+    extra_args = {
+        "temperature": 0,
        "top_p": 0.9,
        "max_tokens": 2000,
        # any other supported parameters for litellm
@@ -139,52 +152,49 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
                Do not miss any models in the entire content. One extracted model JSON format should look like this: 
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
-                extra_args=extra_args
+                extra_args=extra_args,
            ),
            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)

+
 async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
-    "name": "KidoCode Courses",
-    "baseSelector": "section.charge-methodology .w-tab-content > div",
-    "fields": [
-        {
-            "name": "section_title",
-            "selector": "h3.heading-50",
-            "type": "text",
-        },
-        {
-            "name": "section_description",
-            "selector": ".charge-content",
-            "type": "text",
-        },
-        {
-            "name": "course_name",
-            "selector": ".text-block-93",
-            "type": "text",
-        },
-        {
-            "name": "course_description",
-            "selector": ".course-content-text",
-            "type": "text",
-        },
-        {
-            "name": "course_icon",
-            "selector": ".image-92",
-            "type": "attribute",
-            "attribute": "src"
-        }
-    ]
-}
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src",
+            },
+        ],
+    }

-    async with AsyncWebCrawler(
-        headless=True,
-        verbose=True
-    ) as crawler:
-        
+    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Create the JavaScript that handles clicking multiple times
        js_click_tabs = """
        (async () => {
@@ -198,19 +208,20 @@ async def extract_structured_data_using_css_extractor():
                await new Promise(r => setTimeout(r, 500));
            }
        })();
-        """     
+        """

        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
            js_code=[js_click_tabs],
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )

        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        print(json.dumps(companies[0], indent=2))

+
 # Advanced Session-Based Crawling with Dynamic Content 🔄
 async def crawl_dynamic_content_pages_method_1():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -267,6 +278,7 @@ async def crawl_dynamic_content_pages_method_1():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

@@ -334,8 +346,11 @@ async def crawl_dynamic_content_pages_method_2():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_dynamic_content_pages_method_3():
-    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---")
+    print(
+        "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
+    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
@@ -357,7 +372,7 @@ async def crawl_dynamic_content_pages_method_3():
            const firstCommit = commits[0].textContent.trim();
            return firstCommit !== window.firstCommit;
        }"""
-        
+
        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
@@ -395,40 +410,53 @@ async def crawl_dynamic_content_pages_method_3():
        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 async def crawl_custom_browser_type():
    # Use Firefox
    start = time.time()
-    async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(
+        browser_type="firefox", verbose=True, headless=True
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use WebKit
    start = time.time()
-    async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(
+        browser_type="webkit", verbose=True, headless=True
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use Chromium (default)
    start = time.time()
-    async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
+    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com", cache_mode=CacheMode.BYPASS
+        )
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

+
 async def crawl_with_user_simultion():
    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
        url = "YOUR-URL-HERE"
        result = await crawler.arun(
-            url=url,            
+            url=url,
            cache_mode=CacheMode.BYPASS,
-            magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
+            magic=True,  # Automatically detects and removes overlays, popups, and other elements that block content
            # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
            # override_navigator = True # Overrides the navigator object to make it look like a real user
        )
-        
-        print(result.markdown)    
+
+        print(result.markdown)
+

 async def speed_comparison():
    # print("\n--- Speed Comparison ---")
@@ -439,18 +467,18 @@ async def speed_comparison():
    # print()
    # Simulated Firecrawl performance
    from firecrawl import FirecrawlApp
-    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+
+    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    start = time.time()
    scrape_status = app.scrape_url(
-    'https://www.nbcnews.com/business',
-    params={'formats': ['markdown', 'html']}
+        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
-    print()    
+    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
@@ -474,7 +502,9 @@ async def speed_comparison():
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
            ),
            cache_mode=CacheMode.BYPASS,
@@ -498,7 +528,9 @@ async def speed_comparison():
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                )
                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
            ),
            verbose=False,
@@ -520,11 +552,12 @@ async def speed_comparison():
    print("If you run these tests in an environment with better network conditions,")
    print("you may observe an even more significant speed advantage for Crawl4AI.")

+
 async def generate_knowledge_graph():
    class Entity(BaseModel):
        name: str
        description: str
-        
+
    class Relationship(BaseModel):
        entity1: Entity
        entity2: Entity
@@ -536,11 +569,11 @@ async def generate_knowledge_graph():
        relationships: List[Relationship]

    extraction_strategy = LLMExtractionStrategy(
-            provider='openai/gpt-4o-mini', # Or any other provider, including Ollama and open source models
-            api_token=os.getenv('OPENAI_API_KEY'), # In case of Ollama just pass "no-token"
-            schema=KnowledgeGraph.model_json_schema(),
-            extraction_type="schema",
-            instruction="""Extract entities and relationships from the given text."""
+        provider="openai/gpt-4o-mini",  # Or any other provider, including Ollama and open source models
+        api_token=os.getenv("OPENAI_API_KEY"),  # In case of Ollama just pass "no-token"
+        schema=KnowledgeGraph.model_json_schema(),
+        extraction_type="schema",
+        instruction="""Extract entities and relationships from the given text.""",
    )
    async with AsyncWebCrawler() as crawler:
        url = "https://paulgraham.com/love.html"
@@ -554,27 +587,22 @@ async def generate_knowledge_graph():
        with open(os.path.join(__location__, "kb.json"), "w") as f:
            f.write(result.extracted_content)

+
 async def fit_markdown_remove_overlay():
-    
    async with AsyncWebCrawler(
-            headless=True,  # Set to False to see what is happening
-            verbose=True,
-            user_agent_mode="random",
-            user_agent_generator_config={
-                "device_type": "mobile",
-                "os_type": "android"
-            },
+        headless=True,  # Set to False to see what is happening
+        verbose=True,
+        user_agent_mode="random",
+        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    ) as crawler:
        result = await crawler.arun(
-            url='https://www.kidocode.com/degrees/technology',
+            url="https://www.kidocode.com/degrees/technology",
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                ),
-                options={
-                    "ignore_links": True
-                }
+                options={"ignore_links": True},
            ),
            # markdown_generator=DefaultMarkdownGenerator(
            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
@@ -583,31 +611,38 @@ async def fit_markdown_remove_overlay():
            #     }
            # ),
        )
-        
+
        if result.success:
            print(len(result.markdown_v2.raw_markdown))
            print(len(result.markdown_v2.markdown_with_citations))
            print(len(result.markdown_v2.fit_markdown))
-            
+
            # Save clean html
            with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
                f.write(result.cleaned_html)
-            
-            with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+
+            with open(
+                os.path.join(__location__, "output/output_raw_markdown.md"), "w"
+            ) as f:
                f.write(result.markdown_v2.raw_markdown)
-                
-            with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
-                f.write(result.markdown_v2.markdown_with_citations) 
-                
-            with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:   
+
+            with open(
+                os.path.join(__location__, "output/output_markdown_with_citations.md"),
+                "w",
+            ) as f:
+                f.write(result.markdown_v2.markdown_with_citations)
+
+            with open(
+                os.path.join(__location__, "output/output_fit_markdown.md"), "w"
+            ) as f:
                f.write(result.markdown_v2.fit_markdown)
-        
+
    print("Done")


 async def main():
    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    
+
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
@@ -618,7 +653,7 @@ async def main():
    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
-    # await extract_structured_data_using_llm("ollama/llama3.2")    
+    # await extract_structured_data_using_llm("ollama/llama3.2")

    # You always can pass custom headers to the extraction strategy
    # custom_headers = {
@@ -626,13 +661,13 @@ async def main():
    #     "X-Custom-Header": "Some-Value"
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
-    
+
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
-    
+
    # await crawl_custom_browser_type()
-    
+
    # await speed_comparison()


--- a/docs/examples/quickstart_sync.py
+++ b/docs/examples/quickstart_sync.py
@@ -10,15 +10,17 @@ from functools import lru_cache

 console = Console()

+
@lru_cache()
 def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

+
 def print_result(result):
    # Print each key in one line and just the first 10 characters of each one's value and three dots
-    console.print(f"\t[bold]Result:[/bold]")
+    console.print("\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
@@ -33,18 +35,27 @@ def cprint(message, press_any_key=False):
        console.print("Press any key to continue...", style="")
        input()

+
 def basic_usage(crawler):
-    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
-    result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
+    cprint(
+        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
+    )
+    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

+
 def basic_usage_some_params(crawler):
-    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
-    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
+    cprint(
+        "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
+    )
+    result = crawler.run(
+        url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
+    )
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

+
 def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
@@ -55,16 +66,23 @@ def screenshot_usage(crawler):
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)

+
 def understanding_parameters(crawler):
-    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
-    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")
-    
+    cprint(
+        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
+    )
+    cprint(
+        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
+    )
+
    # First crawl (reads from cache)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
-    cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]")
+    cprint(
+        f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
+    )
    print_result(result)

    # Force to crawl again
@@ -72,169 +90,232 @@ def understanding_parameters(crawler):
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
-    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
+    cprint(
+        f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
+    )
    print_result(result)

+
 def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
-    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
-    cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
+    cprint(
+        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
+        True,
+    )
+    cprint(
+        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
+    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
-        chunking_strategy=RegexChunking(patterns=["\n\n"])
+        chunking_strategy=RegexChunking(patterns=["\n\n"]),
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
-    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
-    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
+    cprint(
+        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
+        True,
+    )
+    cprint(
+        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
+    )
    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        chunking_strategy=NlpSentenceChunking()
+        url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)

+
 def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
-    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
-    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
+    cprint(
+        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
+        True,
+    )
+    cprint(
+        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
+    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
-        extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold = 0.3, verbose=True)
+        extraction_strategy=CosineStrategy(
+            word_count_threshold=10,
+            max_dist=0.2,
+            linkage_method="ward",
+            top_k=3,
+            sim_threshold=0.3,
+            verbose=True,
+        ),
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)
-    
+
    # Using semantic_filter with CosineStrategy
-    cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
+    cprint(
+        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
+    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
-        )
+        ),
+    )
+    cprint(
+        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(result)

+
 def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
-    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
-    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
+    cprint(
+        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
+        True,
+    )
+    cprint(
+        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
+    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
-        extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
+        extraction_strategy=LLMExtractionStrategy(
+            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+        ),
+    )
+    cprint(
+        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(result)
-    
+
    # Adding an LLM extraction strategy with instructions
-    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
-    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
+    cprint(
+        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
+        True,
+    )
+    cprint(
+        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
+    )
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
-            instruction="I am interested in only financial news"
-        )
+            api_token=os.getenv("OPENAI_API_KEY"),
+            instruction="I am interested in only financial news",
+        ),
+    )
+    cprint(
+        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(result)
-    
+
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
-            api_token=os.getenv('OPENAI_API_KEY'),
-            instruction="Extract only content related to technology"
-        )
+            api_token=os.getenv("OPENAI_API_KEY"),
+            instruction="Extract only content related to technology",
+        ),
+    )
+    cprint(
+        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(result)

+
 def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
-    cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        css_selector="h2"
+    cprint(
+        "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
+        True,
    )
+    result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)

+
 def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
-    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
-    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    cprint(
+        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
+        True,
+    )
+    cprint(
+        "In this example we try to click the 'Load More' button on the page using JavaScript code."
+    )
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        js = js_code
+    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
+    cprint(
+        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+
 def multiple_scrip(crawler):
    # Passing JavaScript code to interact with the page
-    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
-    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
-    js_code = ["""
+    cprint(
+        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
+        True,
+    )
+    cprint(
+        "In this example we try to click the 'Load More' button on the page using JavaScript code."
+    )
+    js_code = [
+        """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
-    """] * 2
+    """
+    ] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        js = js_code  
+    result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
+    cprint(
+        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
-    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

+
 def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()
-        
+
        # Example customization: logging in to a hypothetical website
-        driver.get('https://example.com/login')
-        
+        driver.get("https://example.com/login")
+
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
-        
+
        WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.NAME, 'username'))
+            EC.presence_of_element_located((By.NAME, "username"))
        )
-        driver.find_element(By.NAME, 'username').send_keys('testuser')
-        driver.find_element(By.NAME, 'password').send_keys('password123')
-        driver.find_element(By.NAME, 'login').click()
+        driver.find_element(By.NAME, "username").send_keys("testuser")
+        driver.find_element(By.NAME, "password").send_keys("password123")
+        driver.find_element(By.NAME, "login").click()
        WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.ID, 'welcome'))
+            EC.presence_of_element_located((By.ID, "welcome"))
        )
        # Add a custom cookie
-        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
-        return driver        
-        
+        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
+        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable Network domain for sending headers
-        driver.execute_cdp_cmd('Network.enable', {})
+        driver.execute_cdp_cmd("Network.enable", {})
        # Add a custom header
-        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
+        driver.execute_cdp_cmd(
+            "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
+        )
        return driver
-    
+
    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
@@ -246,48 +327,59 @@ def using_crawler_hooks(crawler):
        # Example customization: log the HTML
        print(len(html))
        return driver
-    
-    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
-    
+
+    cprint(
+        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
+        True,
+    )
+
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
-    crawler_strategy.set_hook('on_driver_created', on_driver_created)
-    crawler_strategy.set_hook('before_get_url', before_get_url)
-    crawler_strategy.set_hook('after_get_url', after_get_url)
-    crawler_strategy.set_hook('before_return_html', before_return_html)
-    
+    crawler_strategy.set_hook("on_driver_created", on_driver_created)
+    crawler_strategy.set_hook("before_get_url", before_get_url)
+    crawler_strategy.set_hook("after_get_url", after_get_url)
+    crawler_strategy.set_hook("before_return_html", before_return_html)
+
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
-    crawler.warmup()    
+    crawler.warmup()
    result = crawler.run(url="https://example.com")
-    
+
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
-    print_result(result= result)
-    
+    print_result(result=result)
+
+
 def using_crawler_hooks_dleay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")
-        
+
    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
-        crawler_strategy.set_hook('after_get_url', delay)
+        crawler_strategy.set_hook("after_get_url", delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

-    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
+    cprint(
+        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
+    )
    crawler = create_crawler()
-    result = crawler.run(url="https://google.com", bypass_cache=True)    
-    
+    result = crawler.run(url="https://google.com", bypass_cache=True)
+
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)
-    
-    
+

 def main():
-    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
-    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
-    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
+    cprint(
+        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
+    )
+    cprint(
+        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
+    )
+    cprint(
+        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
+    )

    crawler = create_crawler()

@@ -295,7 +387,7 @@ def main():
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)
-    
+
    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
@@ -305,8 +397,10 @@ def main():
    interactive_extraction(crawler)
    multiple_scrip(crawler)

-    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")
+    cprint(
+        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
+    )
+

 if __name__ == "__main__":
    main()
-
--- a/docs/examples/research_assistant.py
+++ b/docs/examples/research_assistant.py
@@ -11,7 +11,9 @@ from groq import Groq
 # Import threadpools to run the crawl_url function in a separate thread
 from concurrent.futures import ThreadPoolExecutor

-client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
+client = AsyncOpenAI(
+    base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY")
+)

 # Instrument the OpenAI client
 cl.instrument_openai()
@@ -25,41 +27,39 @@ settings = {
    "presence_penalty": 0,
 }

+
 def extract_urls(text):
-    url_pattern = re.compile(r'(https?://\S+)')
+    url_pattern = re.compile(r"(https?://\S+)")
    return url_pattern.findall(text)

+
 def crawl_url(url):
    data = {
        "urls": [url],
        "include_raw_html": True,
        "word_count_threshold": 10,
        "extraction_strategy": "NoExtractionStrategy",
-        "chunking_strategy": "RegexChunking"
+        "chunking_strategy": "RegexChunking",
    }
    response = requests.post("https://crawl4ai.com/crawl", json=data)
    response_data = response.json()
-    response_data = response_data['results'][0]
-    return response_data['markdown']
+    response_data = response_data["results"][0]
+    return response_data["markdown"]
+

@cl.on_chat_start
 async def on_chat_start():
-    cl.user_session.set("session", {
-        "history": [],
-        "context": {}
-    })  
-    await cl.Message(
-        content="Welcome to the chat! How can I assist you today?"
-    ).send()
+    cl.user_session.set("session", {"history": [], "context": {}})
+    await cl.Message(content="Welcome to the chat! How can I assist you today?").send()
+

@cl.on_message
 async def on_message(message: cl.Message):
    user_session = cl.user_session.get("session")
-    
+
    # Extract URLs from the user's message
    urls = extract_urls(message.content)
-    
-    
+
    futures = []
    with ThreadPoolExecutor() as executor:
        for url in urls:
@@ -69,16 +69,9 @@ async def on_message(message: cl.Message):

    for url, result in zip(urls, results):
        ref_number = f"REF_{len(user_session['context']) + 1}"
-        user_session["context"][ref_number] = {
-            "url": url,
-            "content": result
-        }    
+        user_session["context"][ref_number] = {"url": url, "content": result}

-
-    user_session["history"].append({
-        "role": "user",
-        "content": message.content
-    })
+    user_session["history"].append({"role": "user", "content": message.content})

    # Create a system message that includes the context
    context_messages = [
@@ -95,26 +88,17 @@ async def on_message(message: cl.Message):
                "If not, there is no need to add a references section. "
                "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
                "\n\n".join(context_messages)
-            )
+            ),
        }
    else:
-        system_message = {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }
-
+        system_message = {"role": "system", "content": "You are a helpful assistant."}

    msg = cl.Message(content="")
    await msg.send()

    # Get response from the LLM
    stream = await client.chat.completions.create(
-        messages=[
-            system_message,
-            *user_session["history"]
-        ],
-        stream=True,
-        **settings
+        messages=[system_message, *user_session["history"]], stream=True, **settings
    )

    assistant_response = ""
@@ -124,10 +108,7 @@ async def on_message(message: cl.Message):
            await msg.stream_token(token)

    # Add assistant message to the history
-    user_session["history"].append({
-        "role": "assistant",
-        "content": assistant_response
-    })
+    user_session["history"].append({"role": "assistant", "content": assistant_response})
    await msg.update()

    # Append the reference section to the assistant's response
@@ -154,10 +135,11 @@ async def on_audio_chunk(chunk: cl.AudioChunk):

    pass

+
@cl.step(type="tool")
 async def speech_to_text(audio_file):
    cli = Groq()
-       
+
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )
@@ -172,24 +154,19 @@ async def on_audio_end(elements: list[ElementBased]):
    audio_buffer.seek(0)  # Move the file pointer to the beginning
    audio_file = audio_buffer.read()
    audio_mime_type: str = cl.user_session.get("audio_mime_type")
-    
+
    start_time = time.time()
    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
    transcription = await speech_to_text(whisper_input)
    end_time = time.time()
    print(f"Transcription took {end_time - start_time} seconds")
-    
-    user_msg = cl.Message(
-        author="You", 
-        type="user_message",
-        content=transcription
-    )
+
+    user_msg = cl.Message(author="You", type="user_message", content=transcription)
    await user_msg.send()
    await on_message(user_msg)


 if __name__ == "__main__":
    from chainlit.cli import run_chainlit
+
    run_chainlit(__file__)
-
-
--- a/docs/examples/rest_call.py
+++ b/docs/examples/rest_call.py
@@ -1,4 +1,3 @@
-
 import requests, base64, os

 data = {
@@ -6,59 +5,50 @@ data = {
    "screenshot": True,
 }

-response = requests.post("https://crawl4ai.com/crawl", json=data) 
-result = response.json()['results'][0]
+response = requests.post("https://crawl4ai.com/crawl", json=data)
+result = response.json()["results"][0]
 print(result.keys())
-# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media', 
-# 'links', 'screenshot', 'markdown', 'extracted_content', 
+# dict_keys(['url', 'html', 'success', 'cleaned_html', 'media',
+# 'links', 'screenshot', 'markdown', 'extracted_content',
 # 'metadata', 'error_message'])
 with open("screenshot.png", "wb") as f:
-    f.write(base64.b64decode(result['screenshot']))
-    
+    f.write(base64.b64decode(result["screenshot"]))
+
 # Example of filtering the content using CSS selectors
 data = {
-    "urls": [
-        "https://www.nbcnews.com/business"
-    ],
+    "urls": ["https://www.nbcnews.com/business"],
    "css_selector": "article",
    "screenshot": True,
 }

 # Example of executing a JS script on the page before extracting the content
 data = {
-    "urls": [
-        "https://www.nbcnews.com/business"
-    ],
+    "urls": ["https://www.nbcnews.com/business"],
    "screenshot": True,
-    'js' : ["""
+    "js": [
+        """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).
    find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
-    """]
+    """
+    ],
 }

 # Example of using a custom extraction strategy
 data = {
-    "urls": [
-        "https://www.nbcnews.com/business"
-    ],
+    "urls": ["https://www.nbcnews.com/business"],
    "extraction_strategy": "CosineStrategy",
-    "extraction_strategy_args": {
-        "semantic_filter": "inflation rent prices"
-    },
+    "extraction_strategy_args": {"semantic_filter": "inflation rent prices"},
 }

 # Example of using LLM to extract content
 data = {
-    "urls": [
-        "https://www.nbcnews.com/business"
-    ],
+    "urls": ["https://www.nbcnews.com/business"],
    "extraction_strategy": "LLMExtractionStrategy",
    "extraction_strategy_args": {
        "provider": "groq/llama3-8b-8192",
        "api_token": os.environ.get("GROQ_API_KEY"),
        "instruction": """I am interested in only financial news, 
-        and translate them in French."""
+        and translate them in French.""",
    },
 }
-
--- a/docs/examples/ssl_example.py
+++ b/docs/examples/ssl_example.py
@@ -5,42 +5,47 @@ import os
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

 # Create tmp directory if it doesn't exist
-parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+)
 tmp_dir = os.path.join(parent_dir, "tmp")
 os.makedirs(tmp_dir, exist_ok=True)

+
 async def main():
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
-        cache_mode=CacheMode.BYPASS  # Bypass cache to always get fresh certificates
+        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
-        result = await crawler.arun(
-            url='https://example.com',
-            config=config
-        )
-        
+        result = await crawler.arun(url="https://example.com", config=config)
+
        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate
-            
+
            # 1. Access certificate properties directly
            print("\nCertificate Information:")
            print(f"Issuer: {cert.issuer.get('CN', '')}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")
-            
+
            # 2. Export certificate in different formats
            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
            print("\nCertificate exported to:")
            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
-            
-            pem_data = cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
+
+            pem_data = cert.to_pem(
+                os.path.join(tmp_dir, "certificate.pem")
+            )  # For web servers
            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
-            
-            der_data = cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
+
+            der_data = cert.to_der(
+                os.path.join(tmp_dir, "certificate.der")
+            )  # For Java apps
            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")

+
 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/examples/summarize_page.py
+++ b/docs/examples/summarize_page.py
@@ -1,39 +1,41 @@
 import os
-import time
 import json
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *

-url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

 crawler = WebCrawler()
 crawler.warmup()

 from pydantic import BaseModel, Field

+
 class PageSummary(BaseModel):
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")

+
 result = crawler.run(
    url=url,
    word_count_threshold=1,
-    extraction_strategy= LLMExtractionStrategy(
-        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
+    extraction_strategy=LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token=os.getenv("OPENAI_API_KEY"),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
-        apply_chunking =False,
-        instruction="From the crawled content, extract the following details: "\
-            "1. Title of the page "\
-            "2. Summary of the page, which is a detailed summary "\
-            "3. Brief summary of the page, which is a paragraph text "\
-            "4. Keywords assigned to the page, which is a list of keywords. "\
-            'The extracted JSON format should look like this: '\
-            '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+        apply_chunking=False,
+        instruction="From the crawled content, extract the following details: "
+        "1. Title of the page "
+        "2. Summary of the page, which is a detailed summary "
+        "3. Brief summary of the page, which is a paragraph text "
+        "4. Keywords assigned to the page, which is a list of keywords. "
+        "The extracted JSON format should look like this: "
+        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
    ),
    bypass_cache=True,
 )
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,4 +1,5 @@
 import os, sys
+
 # append the parent directory to the sys.path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
@@ -13,19 +14,18 @@ import json
 from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.content_filter_strategy import BM25ContentFilter

+
 # 1. File Download Processing Example
 async def download_example():
    """Example of downloading files from Python.org"""
    # downloads_path = os.path.join(os.getcwd(), "downloads")
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)
-    
+
    print(f"Downloads will be saved to: {downloads_path}")
-    
+
    async with AsyncWebCrawler(
-        accept_downloads=True,
-        downloads_path=downloads_path,
-        verbose=True
+        accept_downloads=True, downloads_path=downloads_path, verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
@@ -40,9 +40,9 @@ async def download_example():
            }
            """,
            delay_before_return_html=1,  # Wait 5 seconds to ensure download starts
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
+
        if result.downloaded_files:
            print("\nDownload successful!")
            print("Downloaded files:")
@@ -52,25 +52,26 @@ async def download_example():
        else:
            print("\nNo files were downloaded")

+
 # 2. Local File and Raw HTML Processing Example
 async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
-        f.write("""
+        f.write(
+            """
        <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
        </body></html>
-        """)
-    
+        """
+        )
+
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Process local file
-        local_result = await crawler.arun(
-            url=f"file://{os.path.abspath(sample_file)}"
-        )
-        
+        local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")
+
        # Process raw HTML
        raw_html = """
        <html><body>
@@ -78,16 +79,15 @@ async def local_and_raw_html_example():
            <p>This is a test of raw HTML processing.</p>
        </body></html>
        """
-        raw_result = await crawler.arun(
-            url=f"raw:{raw_html}"
-        )
-        
+        raw_result = await crawler.arun(url=f"raw:{raw_html}")
+
        # Clean up
        os.remove(sample_file)
-        
+
        print("Local file content:", local_result.markdown)
        print("\nRaw HTML content:", raw_result.markdown)

+
 # 3. Enhanced Markdown Generation Example
 async def markdown_generation_example():
    """Example of enhanced markdown generation with citations and LLM-friendly features"""
@@ -97,58 +97,66 @@ async def markdown_generation_example():
            # user_query="History and cultivation",
            bm25_threshold=1.0
        )
-        
+
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=content_filter,
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
-        from crawl4ai import AsyncWebCrawler
+
        from crawl4ai.content_filter_strategy import BM25ContentFilter
-        
+
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
-            content_filter=BM25ContentFilter()
+            content_filter=BM25ContentFilter(),
        )
        print(result.markdown_v2.fit_markdown)
-        
+
        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
-        print(f"2. New markdown versions (markdown_v2):")
+        print("2. New markdown versions (markdown_v2):")
        print(f"   - Raw markdown length: {len(result.markdown_v2.raw_markdown)}")
-        print(f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}")
-        print(f"   - References section length: {len(result.markdown_v2.references_markdown)}")
+        print(
+            f"   - Citations markdown length: {len(result.markdown_v2.markdown_with_citations)}"
+        )
+        print(
+            f"   - References section length: {len(result.markdown_v2.references_markdown)}"
+        )
        if result.markdown_v2.fit_markdown:
-            print(f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}")
-        
+            print(
+                f"   - Filtered markdown length: {len(result.markdown_v2.fit_markdown)}"
+            )
+
        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)
-        
+
        # Save different versions
        with open(os.path.join(output_dir, "1_raw_markdown.md"), "w") as f:
            f.write(result.markdown_v2.raw_markdown)
-            
+
        with open(os.path.join(output_dir, "2_citations_markdown.md"), "w") as f:
            f.write(result.markdown_v2.markdown_with_citations)
-            
+
        with open(os.path.join(output_dir, "3_references.md"), "w") as f:
            f.write(result.markdown_v2.references_markdown)
-            
+
        if result.markdown_v2.fit_markdown:
            with open(os.path.join(output_dir, "4_filtered_markdown.md"), "w") as f:
                f.write(result.markdown_v2.fit_markdown)
-                
+
        print(f"\nMarkdown examples saved to: {output_dir}")
-        
+
        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(result.markdown_v2.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
-        print('\n'.join(result.markdown_v2.references_markdown.split('\n')[:10]) + "...")
+        print(
+            "\n".join(result.markdown_v2.references_markdown.split("\n")[:10]) + "..."
+        )
+

 # 4. Browser Management Example
 async def browser_management_example():
@@ -156,38 +164,38 @@ async def browser_management_example():
    # Use the specified user directory path
    user_data_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(user_data_dir, exist_ok=True)
-    
+
    print(f"Browser profile will be saved to: {user_data_dir}")
-    
+
    async with AsyncWebCrawler(
        use_managed_browser=True,
        user_data_dir=user_data_dir,
        headless=False,
-        verbose=True
+        verbose=True,
    ) as crawler:
-
        result = await crawler.arun(
            url="https://crawl4ai.com",
            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS
-        )        
+            cache_mode=CacheMode.BYPASS,
+        )
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
            # session_id="persistent_session_1",
-            cache_mode=CacheMode.BYPASS
+            cache_mode=CacheMode.BYPASS,
        )
-        
+
        print("\nBrowser session result:", result.success)
        if result.success:
-            print("Page title:", result.metadata.get('title', 'No title found'))
+            print("Page title:", result.metadata.get("title", "No title found"))
+

 # 5. API Usage Example
 async def api_example():
    """Example of using the new API endpoints"""
-    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
-    headers = {'Authorization': f'Bearer {api_token}'}    
+    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
+    headers = {"Authorization": f"Bearer {api_token}"}
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
@@ -199,25 +207,17 @@ async def api_example():
                        "name": "Hacker News Articles",
                        "baseSelector": ".athing",
                        "fields": [
-                            {
-                                "name": "title",
-                                "selector": ".title a",
-                                "type": "text"
-                            },
-                            {
-                                "name": "score",
-                                "selector": ".score",
-                                "type": "text"
-                            },
+                            {"name": "title", "selector": ".title a", "type": "text"},
+                            {"name": "score", "selector": ".score", "type": "text"},
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
-                                "attribute": "href"
-                            }
-                        ]
+                                "attribute": "href",
+                            },
+                        ],
                    }
-                }
+                },
            },
            "crawler_params": {
                "headless": True,
@@ -227,51 +227,50 @@ async def api_example():
            # "screenshot": True,
            # "magic": True
        }
-        
+
        async with session.post(
-            "http://localhost:11235/crawl",
-            json=crawl_request,
-            headers=headers
+            "http://localhost:11235/crawl", json=crawl_request, headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]
-            
+
            # Check task status
            while True:
                async with session.get(
-                    f"http://localhost:11235/task/{task_id}",
-                    headers=headers
+                    f"http://localhost:11235/task/{task_id}", headers=headers
                ) as status_response:
                    result = await status_response.json()
                    print(f"Task status: {result['status']}")
-                    
+
                    if result["status"] == "completed":
                        print("Task completed!")
                        print("Results:")
-                        news = json.loads(result["results"][0]['extracted_content'])
+                        news = json.loads(result["results"][0]["extracted_content"])
                        print(json.dumps(news[:4], indent=2))
                        break
                    else:
                        await asyncio.sleep(1)

+
 # Main execution
 async def main():
    # print("Running Crawl4AI feature examples...")
-    
+
    # print("\n1. Running Download Example:")
    # await download_example()
-    
+
    # print("\n2. Running Markdown Generation Example:")
    # await markdown_generation_example()
-    
+
    # # print("\n3. Running Local and Raw HTML Example:")
    # await local_and_raw_html_example()
-    
+
    # # print("\n4. Running Browser Management Example:")
    await browser_management_example()
-    
+
    # print("\n5. Running API Example:")
    await api_example()

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/v0_4_24_walkthrough.py
+++ b/docs/examples/v0_4_24_walkthrough.py
@@ -10,18 +10,17 @@ import asyncio
 import os
 import json
 import re
-from typing import List, Optional, Dict, Any
-from pydantic import BaseModel, Field
+from typing import List
 from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    LLMExtractionStrategy,
-    JsonCssExtractionStrategy
+    JsonCssExtractionStrategy,
 )
 from crawl4ai.content_filter_strategy import RelevantContentFilter
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator 
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from bs4 import BeautifulSoup

 # Sample HTML for demonstrations
@@ -52,17 +51,18 @@ SAMPLE_HTML = """
 </div>
 """

+
 async def demo_ssl_features():
    """
    Enhanced SSL & Security Features Demo
    -----------------------------------
-    
+
    This example demonstrates the new SSL certificate handling and security features:
    1. Custom certificate paths
    2. SSL verification options
    3. HTTPS error handling
    4. Certificate validation configurations
-    
+
    These features are particularly useful when:
    - Working with self-signed certificates
    - Dealing with corporate proxies
@@ -76,14 +76,11 @@ async def demo_ssl_features():

    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-        fetch_ssl_certificate=True  # Enable SSL certificate fetching
+        fetch_ssl_certificate=True,  # Enable SSL certificate fetching
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url="https://example.com",
-            config=run_config
-        )
+        result = await crawler.arun(url="https://example.com", config=run_config)
        print(f"SSL Crawl Success: {result.success}")
        result.ssl_certificate.to_json(
            os.path.join(os.getcwd(), "ssl_certificate.json")
@@ -91,11 +88,12 @@ async def demo_ssl_features():
        if not result.success:
            print(f"SSL Error: {result.error_message}")

+
 async def demo_content_filtering():
    """
    Smart Content Filtering Demo
    ----------------------
-    
+
    Demonstrates advanced content filtering capabilities:
    1. Custom filter to identify and extract specific content
    2. Integration with markdown generation
@@ -110,87 +108,90 @@ async def demo_content_filtering():
            super().__init__()
            # Add news-specific patterns
            self.negative_patterns = re.compile(
-                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
-                re.I
+                r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
+                re.I,
            )
            self.min_word_count = 30  # Higher threshold for news content

-        def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+        def filter_content(
+            self, html: str, min_word_threshold: int = None
+        ) -> List[str]:
            """
            Implements news-specific content filtering logic.
-            
+
            Args:
                html (str): HTML content to be filtered
                min_word_threshold (int, optional): Minimum word count threshold
-                
+
            Returns:
                List[str]: List of filtered HTML content blocks
            """
            if not html or not isinstance(html, str):
                return []
-                
-            soup = BeautifulSoup(html, 'lxml')
+
+            soup = BeautifulSoup(html, "lxml")
            if not soup.body:
-                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
-            
-            body = soup.find('body')
-            
+                soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
+
+            body = soup.find("body")
+
            # Extract chunks with metadata
-            chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)
-            
+            chunks = self.extract_text_chunks(
+                body, min_word_threshold or self.min_word_count
+            )
+
            # Filter chunks based on news-specific criteria
            filtered_chunks = []
            for _, text, tag_type, element in chunks:
                # Skip if element has negative class/id
                if self.is_excluded(element):
                    continue
-                    
+
                # Headers are important in news articles
-                if tag_type == 'header':
+                if tag_type == "header":
                    filtered_chunks.append(self.clean_element(element))
                    continue
-                    
+
                # For content, check word count and link density
                text = element.get_text(strip=True)
                if len(text.split()) >= (min_word_threshold or self.min_word_count):
                    # Calculate link density
-                    links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
+                    links_text = " ".join(
+                        a.get_text(strip=True) for a in element.find_all("a")
+                    )
                    link_density = len(links_text) / len(text) if text else 1
-                    
+
                    # Accept if link density is reasonable
                    if link_density < 0.5:
                        filtered_chunks.append(self.clean_element(element))
-            
+
            return filtered_chunks

    # Create markdown generator with custom filter
-    markdown_gen = DefaultMarkdownGenerator(
-        content_filter=CustomNewsFilter()
-    )
+    markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())

    run_config = CrawlerRunConfig(
-        markdown_generator=markdown_gen,
-        cache_mode=CacheMode.BYPASS
+        markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
-            url="https://news.ycombinator.com",
-            config=run_config
+            url="https://news.ycombinator.com", config=run_config
        )
        print("Filtered Content Sample:")
        print(result.markdown[:500])  # Show first 500 chars

+
 async def demo_json_extraction():
    """
    Improved JSON Extraction Demo
    ---------------------------
-    
+
    Demonstrates the enhanced JSON extraction capabilities:
    1. Base element attributes extraction
    2. Complex nested structures
    3. Multiple extraction patterns
-    
+
    Key features shown:
    - Extracting attributes from base elements (href, data-* attributes)
    - Processing repeated patterns
@@ -206,7 +207,7 @@ async def demo_json_extraction():
            "baseSelector": "div.article-list",
            "baseFields": [
                {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
-                {"name": "category", "type": "attribute", "attribute": "data-category"}
+                {"name": "category", "type": "attribute", "attribute": "data-category"},
            ],
            "fields": [
                {
@@ -214,8 +215,16 @@ async def demo_json_extraction():
                    "selector": "article.post",
                    "type": "nested_list",
                    "baseFields": [
-                        {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
-                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+                        {
+                            "name": "post_id",
+                            "type": "attribute",
+                            "attribute": "data-post-id",
+                        },
+                        {
+                            "name": "author_id",
+                            "type": "attribute",
+                            "attribute": "data-author",
+                        },
                    ],
                    "fields": [
                        {
@@ -223,60 +232,68 @@ async def demo_json_extraction():
                            "selector": "h2.title a",
                            "type": "text",
                            "baseFields": [
-                                {"name": "url", "type": "attribute", "attribute": "href"}
-                            ]
+                                {
+                                    "name": "url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                }
+                            ],
                        },
                        {
                            "name": "author",
                            "selector": "div.meta a.author",
                            "type": "text",
                            "baseFields": [
-                                {"name": "profile_url", "type": "attribute", "attribute": "href"}
-                            ]
-                        },
-                        {
-                            "name": "date",
-                            "selector": "span.date",
-                            "type": "text"
+                                {
+                                    "name": "profile_url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                }
+                            ],
                        },
+                        {"name": "date", "selector": "span.date", "type": "text"},
                        {
                            "name": "read_more",
                            "selector": "a.read-more",
                            "type": "nested",
                            "fields": [
                                {"name": "text", "type": "text"},
-                                {"name": "url", "type": "attribute", "attribute": "href"}
-                            ]
-                        }
-                    ]
+                                {
+                                    "name": "url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                },
+                            ],
+                        },
+                    ],
                }
-            ]
+            ],
        }
    )

    # Demonstrate extraction from raw HTML
    run_config = CrawlerRunConfig(
-        extraction_strategy=json_strategy,
-        cache_mode=CacheMode.BYPASS
+        extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="raw:" + SAMPLE_HTML,  # Use raw: prefix for raw HTML
-            config=run_config
+            config=run_config,
        )
        print("Extracted Content:")
        print(result.extracted_content)

+
 async def demo_input_formats():
    """
    Input Format Handling Demo
    ----------------------
-    
+
    Demonstrates how LLM extraction can work with different input formats:
    1. Markdown (default) - Good for simple text extraction
    2. HTML - Better when you need structure and attributes
-    
+
    This example shows how HTML input can be beneficial when:
    - You need to understand the DOM structure
    - You want to extract both visible text and HTML attributes
@@ -350,7 +367,7 @@ async def demo_input_formats():
        </footer>
    </div>
    """
-    
+
    # Use raw:// prefix to pass HTML content directly
    url = f"raw://{dummy_html}"

@@ -359,18 +376,30 @@ async def demo_input_formats():

    # Define our schema using Pydantic
    class JobRequirement(BaseModel):
-        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
-        items: List[str] = Field(description="List of specific requirements in this category")
-        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+        category: str = Field(
+            description="Category of the requirement (e.g., Technical, Soft Skills)"
+        )
+        items: List[str] = Field(
+            description="List of specific requirements in this category"
+        )
+        priority: str = Field(
+            description="Priority level (Required/Preferred) based on the HTML class or context"
+        )

    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        salary_range: Optional[str] = Field(description="Salary range if specified")
-        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
-        application_deadline: Optional[str] = Field(description="Application deadline if specified")
-        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+        requirements: List[JobRequirement] = Field(
+            description="Categorized job requirements"
+        )
+        application_deadline: Optional[str] = Field(
+            description="Application deadline if specified"
+        )
+        contact_info: Optional[dict] = Field(
+            description="Contact information from footer or contact section"
+        )

    # First try with markdown (default)
    markdown_strategy = LLMExtractionStrategy(
@@ -382,7 +411,7 @@ async def demo_input_formats():
        Extract job posting details into structured data. Focus on the visible text content 
        and organize requirements into categories.
        """,
-        input_format="markdown"  # default
+        input_format="markdown",  # default
    )

    # Then with HTML for better structure understanding
@@ -400,34 +429,25 @@ async def demo_input_formats():
        
        Use HTML attributes and classes to enhance extraction accuracy.
        """,
-        input_format="html"  # explicitly use HTML
+        input_format="html",  # explicitly use HTML
    )

    async with AsyncWebCrawler() as crawler:
        # Try with markdown first
-        markdown_config = CrawlerRunConfig(
-            extraction_strategy=markdown_strategy
-        )
-        markdown_result = await crawler.arun(
-            url=url,
-            config=markdown_config
-        )
+        markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
+        markdown_result = await crawler.arun(url=url, config=markdown_config)
        print("\nMarkdown-based Extraction Result:")
        items = json.loads(markdown_result.extracted_content)
        print(json.dumps(items, indent=2))

        # Then with HTML for better structure understanding
-        html_config = CrawlerRunConfig(
-            extraction_strategy=html_strategy
-        )
-        html_result = await crawler.arun(
-            url=url,
-            config=html_config
-        )
+        html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
+        html_result = await crawler.arun(url=url, config=html_config)
        print("\nHTML-based Extraction Result:")
        items = json.loads(html_result.extracted_content)
        print(json.dumps(items, indent=2))

+
 # Main execution
 async def main():
    print("Crawl4AI v0.4.24 Feature Walkthrough")
@@ -439,5 +459,6 @@ async def main():
    await demo_json_extraction()
    # await demo_input_formats()

+
 if __name__ == "__main__":
    asyncio.run(main())