chore: resolve merge conflicts for v0.4.24

2024-12-31 19:24:03 +08:00
parent a04870a662 553a4622bf
commit 67d0999bc3
79 changed files with 11903 additions and 7543 deletions
--- a/docs/examples/amazon_product_extraction_direct_url.py
+++ b/docs/examples/amazon_product_extraction_direct_url.py
@@ -0,0 +1,114 @@
+"""
+This example demonstrates how to use JSON CSS extraction to scrape product information 
+from Amazon search results. It shows how to extract structured data like product titles,
+prices, ratings, and other details using CSS selectors.
+"""
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+
+async def extract_amazon_products():
+    """Crawl an Amazon search-results page and print every product found.
+
+    A JSON CSS schema is applied to each search-result card, the JSON string
+    returned by the crawler is decoded, and the products are written to stdout.
+    Nothing is printed when the crawl yields no extracted content.
+    """
+    # One entry per value extracted from a search-result card.
+    product_fields = [
+        # Empty selector: presumably reads the attribute from the card element
+        # itself - confirm against JsonCssExtractionStrategy.
+        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
+        {"name": "title", "selector": "h2 a span", "type": "text"},
+        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
+        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
+        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
+        {
+            "name": "reviews_count",
+            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+            "type": "text",
+        },
+        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
+        {
+            "name": "original_price",
+            "selector": ".a-price.a-text-price .a-offscreen",
+            "type": "text",
+        },
+        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
+        {
+            "name": "delivery_info",
+            "selector": "[data-cy='delivery-recipe'] .a-color-base",
+            "type": "text",
+            "multiple": True,
+        },
+    ]
+    search_results_schema = {
+        "name": "Amazon Product Search Results",
+        "baseSelector": "[data-component-type='s-search-result']",
+        "fields": product_fields,
+    }
+
+    browser_config = BrowserConfig(browser_type="chromium", headless=True)
+    crawler_config = CrawlerRunConfig(
+        extraction_strategy=JsonCssExtractionStrategy(schema=search_results_schema)
+    )
+
+    # Example search URL (you should replace with your actual Amazon URL)
+    url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"
+
+    # Label/key pairs for the fields printed verbatim, in output order.
+    plain_fields = (
+        ("ASIN", "asin"),
+        ("Title", "title"),
+        ("Price", "price"),
+        ("Original Price", "original_price"),
+        ("Rating", "rating"),
+        ("Reviews", "reviews_count"),
+    )
+
+    # The context manager takes care of starting and closing the browser.
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url=url, config=crawler_config)
+
+        if not (result and result.extracted_content):
+            return
+
+        # extracted_content is a JSON string holding a list of product dicts.
+        for product in json.loads(result.extracted_content):
+            print("\nProduct Details:")
+            for label, key in plain_fields:
+                print(f"{label}: {product.get(key)}")
+            print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+            delivery = product.get('delivery_info')
+            if delivery:
+                print(f"Delivery: {' '.join(delivery)}")
+            print("-" * 80)
+
+
+# Script entry point: run the example on a fresh event loop.
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(extract_amazon_products())
--- a/docs/examples/amazon_product_extraction_using_hooks.py
+++ b/docs/examples/amazon_product_extraction_using_hooks.py
@@ -0,0 +1,145 @@
+"""
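+This example demonstrates how to combine a crawler hook with JSON CSS extraction to
+scrape product information from Amazon. An `after_goto` hook types a query into the
+search box on the Amazon home page, submits it and waits for the results to load;
+structured data such as product titles, prices and ratings is then extracted from
+the search results using CSS selectors.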
+"""
+
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+from playwright.async_api import Page, BrowserContext
+
+async def extract_amazon_products():
+    """Search Amazon from its home page via an ``after_goto`` hook and print the products.
+
+    The crawler opens the home page; the hook fills in and submits the search
+    form, and the JSON CSS schema is then applied to the search-result cards.
+    Nothing is printed when the crawl yields no extracted content.
+    """
+    # One entry per value extracted from a search-result card.
+    product_fields = [
+        # Empty selector: presumably reads the attribute from the card element
+        # itself - confirm against JsonCssExtractionStrategy.
+        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
+        {"name": "title", "selector": "h2 a span", "type": "text"},
+        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
+        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
+        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
+        {
+            "name": "reviews_count",
+            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+            "type": "text",
+        },
+        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
+        {
+            "name": "original_price",
+            "selector": ".a-price.a-text-price .a-offscreen",
+            "type": "text",
+        },
+        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
+        {
+            "name": "delivery_info",
+            "selector": "[data-cy='delivery-recipe'] .a-color-base",
+            "type": "text",
+            "multiple": True,
+        },
+    ]
+    search_results_schema = {
+        "name": "Amazon Product Search Results",
+        "baseSelector": "[data-component-type='s-search-result']",
+        "fields": product_fields,
+    }
+
+    browser_config = BrowserConfig(headless=True)
+
+    # Bypass the cache and extract with the JSON CSS schema defined above.
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema=search_results_schema),
+    )
+
+    # Start on the home page; the hook below performs the actual search.
+    url = "https://www.amazon.com/"
+
+    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+        """Run the product search on the freshly loaded page.
+
+        Fills the search box, clicks the submit button and waits for result
+        cards to appear. Any failure is printed rather than raised, and the
+        page is returned either way.
+        """
+        print(f"[HOOK] after_goto - Successfully loaded: {url}")
+
+        try:
+            query_input = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
+            await query_input.fill('Samsung Galaxy Tab')
+
+            submit = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
+            await submit.click()
+
+            # Result cards showing up is the signal that the search finished.
+            await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
+            print("[HOOK] Search completed and results loaded!")
+        except Exception as e:
+            print(f"[HOOK] Error during search operation: {str(e)}")
+
+        return page
+
+    # Label/key pairs for the fields printed verbatim, in output order.
+    plain_fields = (
+        ("ASIN", "asin"),
+        ("Title", "title"),
+        ("Price", "price"),
+        ("Original Price", "original_price"),
+        ("Rating", "rating"),
+        ("Reviews", "reviews_count"),
+    )
+
+    # The context manager takes care of starting and closing the browser.
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        crawler.crawler_strategy.set_hook("after_goto", after_goto)
+
+        result = await crawler.arun(url=url, config=crawler_config)
+
+        if not (result and result.extracted_content):
+            return
+
+        # extracted_content is a JSON string holding a list of product dicts.
+        for product in json.loads(result.extracted_content):
+            print("\nProduct Details:")
+            for label, key in plain_fields:
+                print(f"{label}: {product.get(key)}")
+            print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+            delivery = product.get('delivery_info')
+            if delivery:
+                print(f"Delivery: {' '.join(delivery)}")
+            print("-" * 80)
+
+
+# Script entry point: run the example on a fresh event loop.
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(extract_amazon_products())
--- a/docs/examples/amazon_product_extraction_using_use_javascript.py
+++ b/docs/examples/amazon_product_extraction_using_use_javascript.py
@@ -0,0 +1,129 @@
+"""
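+This example demonstrates how to combine injected JavaScript with JSON CSS extraction
+to scrape product information from Amazon. A `js_code` snippet fills in and submits
+the search form on the Amazon home page, `wait_for` blocks until the search results
+appear, and structured data such as product titles, prices and ratings is then
+extracted using CSS selectors.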
+"""
+
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import json
+from playwright.async_api import Page, BrowserContext
+
+async def extract_amazon_products():
+    """Search Amazon from its home page with injected JavaScript and print the products.
+
+    The crawler opens the home page, runs ``js_code`` to fill in and submit the
+    search form, waits for the first search-result card (``wait_for``) and then
+    applies the JSON CSS schema to the result cards. Nothing is printed when
+    the crawl yields no extracted content.
+    """
+    # Initialize browser config
+    browser_config = BrowserConfig(
+        # browser_type="chromium",
+        headless=True
+    )
+
+    # JavaScript passed to the crawler as js_code: fill the search box and
+    # click the submit button.
+    js_code_to_search = """
+        const task = async () => {
+            document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
+            document.querySelector('#nav-search-submit-button').click();
+        }
+        await task();
+    """
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        js_code=js_code_to_search,
+        # Wait until at least one search-result card is present.
+        wait_for='css:[data-component-type="s-search-result"]',
+        extraction_strategy=JsonCssExtractionStrategy(
+            schema={
+                "name": "Amazon Product Search Results",
+                "baseSelector": "[data-component-type='s-search-result']",
+                "fields": [
+                    {
+                        "name": "asin",
+                        "selector": "",
+                        "type": "attribute",
+                        "attribute": "data-asin"
+                    },
+                    {
+                        "name": "title",
+                        "selector": "h2 a span",
+                        "type": "text"
+                    },
+                    {
+                        "name": "url",
+                        "selector": "h2 a",
+                        "type": "attribute",
+                        "attribute": "href"
+                    },
+                    {
+                        "name": "image",
+                        "selector": ".s-image",
+                        "type": "attribute",
+                        "attribute": "src"
+                    },
+                    {
+                        "name": "rating",
+                        "selector": ".a-icon-star-small .a-icon-alt",
+                        "type": "text"
+                    },
+                    {
+                        "name": "reviews_count",
+                        "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
+                        "type": "text"
+                    },
+                    {
+                        "name": "price",
+                        "selector": ".a-price .a-offscreen",
+                        "type": "text"
+                    },
+                    {
+                        "name": "original_price",
+                        "selector": ".a-price.a-text-price .a-offscreen",
+                        "type": "text"
+                    },
+                    {
+                        "name": "sponsored",
+                        "selector": ".puis-sponsored-label-text",
+                        "type": "exists"
+                    },
+                    {
+                        "name": "delivery_info",
+                        "selector": "[data-cy='delivery-recipe'] .a-color-base",
+                        "type": "text",
+                        "multiple": True
+                    }
+                ]
+            }
+        )
+    )
+
+    # Start on the Amazon home page; the injected JavaScript performs the search.
+    url = "https://www.amazon.com/"
+
+    # Use context manager for proper resource handling
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Extract the data
+        result = await crawler.arun(url=url, config=crawler_config)
+
+        # Process and print the results
+        if result and result.extracted_content:
+            # Parse the JSON string into a list of products
+            products = json.loads(result.extracted_content)
+
+            # Process each product in the list
+            for product in products:
+                print("\nProduct Details:")
+                print(f"ASIN: {product.get('asin')}")
+                print(f"Title: {product.get('title')}")
+                print(f"Price: {product.get('price')}")
+                print(f"Original Price: {product.get('original_price')}")
+                print(f"Rating: {product.get('rating')}")
+                print(f"Reviews: {product.get('reviews_count')}")
+                print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
+                if product.get('delivery_info'):
+                    print(f"Delivery: {' '.join(product['delivery_info'])}")
+                print("-" * 80)
+
+
+# Script entry point: run the example on a fresh event loop.
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(extract_amazon_products())
--- a/docs/examples/browser_optimization_example.py
+++ b/docs/examples/browser_optimization_example.py
@@ -0,0 +1,128 @@
+"""
+This example demonstrates optimal browser usage patterns in Crawl4AI:
+1. Sequential crawling with session reuse
+2. Parallel crawling with browser instance reuse
+3. Performance optimization settings
+"""
+
+import asyncio
+import os
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def crawl_sequential(urls: List[str]):
+    """
+    Sequential crawling using session reuse - most efficient for moderate workloads
+
+    Starts one crawler, crawls each URL in order with the same ``session_id``,
+    prints the outcome of every crawl and always closes the crawler afterwards.
+
+    Args:
+        urls: The URLs to crawl, one after another.
+    """
+    print("\n=== Sequential Crawling with Session Reuse ===")
+
+    # Configure browser with optimized settings
+    browser_config = BrowserConfig(
+        headless=True,
+        browser_args=[
+            "--disable-gpu",  # Disable GPU acceleration
+            "--disable-dev-shm-usage",  # Disable /dev/shm usage
+            "--no-sandbox",  # Required for Docker
+        ],
+        viewport={
+            "width": 800,
+            "height": 600,
+        },  # Smaller viewport for better performance
+    )
+
+    # Configure crawl settings
+    crawl_config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            #  content_filter=PruningContentFilter(), In case you need fit_markdown
+        ),
+    )
+
+    # Create single crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        session_id = "session1"  # Use same session for all URLs
+        for url in urls:
+            # NOTE(review): confirm that a session_id keyword is honoured when
+            # `config` is also supplied; it may need to be set on
+            # CrawlerRunConfig instead.
+            result = await crawler.arun(
+                url=url,
+                config=crawl_config,
+                session_id=session_id,  # Reuse same browser tab
+            )
+            if result.success:
+                print(f"Successfully crawled {url}")
+                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+            else:
+                # Report failures instead of dropping them silently.
+                print(f"Failed to crawl {url}")
+    finally:
+        # Close the browser even if a crawl raised.
+        await crawler.close()
+
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+    """
+    Parallel crawling while reusing browser instance - best for large workloads
+
+    Starts one crawler and processes ``urls`` in batches of ``max_concurrent``.
+    The crawls of a batch run concurrently, each under its own session id, and
+    the outcome of every crawl is printed. The crawler is always closed.
+
+    Args:
+        urls: The URLs to crawl.
+        max_concurrent: Maximum number of crawls in flight at once.
+
+    Raises:
+        ValueError: If ``max_concurrent`` is smaller than 1.
+    """
+    # Validate before launching a browser: a step of 0 makes range() raise and
+    # a negative step would silently crawl nothing.
+    if max_concurrent < 1:
+        raise ValueError("max_concurrent must be at least 1")
+
+    print("\n=== Parallel Crawling with Browser Reuse ===")
+
+    browser_config = BrowserConfig(
+        headless=True,
+        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        viewport={"width": 800, "height": 600},
+    )
+
+    crawl_config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            #  content_filter=PruningContentFilter(), In case you need fit_markdown
+        ),
+    )
+
+    # Create single crawler instance for all parallel tasks
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        # Create tasks in batches to control concurrency
+        for i in range(0, len(urls), max_concurrent):
+            batch = urls[i : i + max_concurrent]
+
+            # Different session per concurrent task; the ids repeat from one
+            # batch to the next because the index restarts at 0.
+            tasks = [
+                crawler.arun(
+                    url=url, config=crawl_config, session_id=f"parallel_session_{j}"
+                )
+                for j, url in enumerate(batch)
+            ]
+
+            # Wait for batch to complete; exceptions are returned, not raised,
+            # so one failing URL does not abort the rest of the batch.
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Process results
+            for url, result in zip(batch, results):
+                if isinstance(result, Exception):
+                    print(f"Error crawling {url}: {str(result)}")
+                elif result.success:
+                    print(f"Successfully crawled {url}")
+                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+                else:
+                    # Report failures instead of dropping them silently.
+                    print(f"Failed to crawl {url}")
+    finally:
+        # Close the browser even if a batch raised.
+        await crawler.close()
+
+
+async def main():
+    """Run the sequential demo, then the parallel demo, on the same example URLs."""
+    # Placeholder pages used by both demos.
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3",
+        "https://example.com/page4",
+    ]
+
+    # One URL at a time, sharing a session.
+    await crawl_sequential(urls)
+
+    # Two URLs at a time, sharing a browser.
+    await crawl_parallel(urls, max_concurrent=2)
+
+
+# Script entry point.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_example.py
@@ -0,0 +1,115 @@
+"""
+Example demonstrating different extraction strategies with various input formats.
+This example shows how to:
+1. Use different input formats (markdown, HTML, fit_markdown)
+2. Work with JSON-based extractors (CSS and XPath)
+3. Use LLM-based extraction with different input formats
+4. Configure browser and crawler settings properly
+"""
+
+import asyncio
+import os
+from typing import Dict, Any
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import (
+    LLMExtractionStrategy,
+    JsonCssExtractionStrategy,
+    JsonXPathExtractionStrategy
+)
+from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
+    """Crawl ``url`` with one extraction strategy and print a short report.
+
+    ``name`` labels the printed output. Any exception raised while configuring,
+    crawling or reporting is caught and printed instead of propagating.
+    """
+    try:
+        run_config = CrawlerRunConfig(
+            extraction_strategy=strategy,
+            cache_mode=CacheMode.BYPASS,
+            # Content filter attached for fit_markdown support.
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter()
+            ),
+        )
+
+        result = await crawler.arun(url=url, config=run_config)
+
+        if not result.success:
+            print(f"Error in {name}: Crawl failed")
+            return
+
+        print(f"\n=== {name} Results ===")
+        print(f"Extracted Content: {result.extracted_content}")
+        markdown = result.markdown_v2
+        print(f"Raw Markdown Length: {len(markdown.raw_markdown)}")
+        print(f"Citations Markdown Length: {len(markdown.markdown_with_citations)}")
+    except Exception as e:
+        print(f"Error in {name}: {str(e)}")
+
+async def main():
+    """Build five extraction strategies and run each against one example URL.
+
+    Three LLM strategies (default input, HTML input, fit_markdown input), one
+    JSON CSS strategy and one JSON XPath strategy are run one after another on
+    a single shared crawler.
+    """
+    # Example URL (replace with actual URL)
+    url = "https://example.com/product-page"
+
+    browser_config = BrowserConfig(headless=True, verbose=True)
+
+    # Settings shared by the three LLM strategies.
+    llm_provider = "openai/gpt-4o-mini"
+    openai_key = os.getenv("OPENAI_API_KEY")
+
+    # 1. LLM extraction: no input_format given, so the strategy's default applies.
+    markdown_strategy = LLMExtractionStrategy(
+        provider=llm_provider,
+        api_token=openai_key,
+        instruction="Extract product information including name, price, and description",
+    )
+    html_strategy = LLMExtractionStrategy(
+        input_format="html",
+        provider=llm_provider,
+        api_token=openai_key,
+        instruction="Extract product information from HTML including structured data",
+    )
+    fit_markdown_strategy = LLMExtractionStrategy(
+        input_format="fit_markdown",
+        provider=llm_provider,
+        api_token=openai_key,
+        instruction="Extract product information from cleaned markdown",
+    )
+
+    # 2. JSON CSS extraction
+    css_strategy = JsonCssExtractionStrategy(
+        schema={
+            "baseSelector": ".product",
+            "fields": [
+                {"name": "title", "selector": "h1.product-title", "type": "text"},
+                {"name": "price", "selector": ".price", "type": "text"},
+                {"name": "description", "selector": ".description", "type": "text"},
+            ],
+        }
+    )
+
+    # 3. JSON XPath extraction
+    xpath_strategy = JsonXPathExtractionStrategy(
+        schema={
+            "baseSelector": "//div[@class='product']",
+            "fields": [
+                {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
+                {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
+                {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"},
+            ],
+        }
+    )
+
+    # Strategies paired with the label used in the printed report, in run order.
+    runs = [
+        (markdown_strategy, "Markdown LLM"),
+        (html_strategy, "HTML LLM"),
+        (fit_markdown_strategy, "Fit Markdown LLM"),
+        (css_strategy, "CSS Extraction"),
+        (xpath_strategy, "XPath Extraction"),
+    ]
+
+    # The context manager takes care of starting and closing the browser.
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for strategy, label in runs:
+            await run_extraction(crawler, url, strategy, label)
+
+
+# Script entry point.
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -39,8 +39,8 @@ async def main():
                    f.write(b64decode(result.screenshot))
            
            # Save PDF
-            if result.pdf_data:
-                pdf_bytes = b64decode(result.pdf_data)
+            if result.pdf:
+                pdf_bytes = b64decode(result.pdf)
                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
                    f.write(pdf_bytes)

--- a/docs/examples/hooks_example.py
+++ b/docs/examples/hooks_example.py
@@ -0,0 +1,107 @@
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from playwright.async_api import Page, BrowserContext
+
+async def main():
+    """Crawl https://example.com with a callback registered on every crawler hook.
+
+    Each hook prints a message when it is invoked; some also show a typical
+    customisation (cookies, viewport, extra headers, scrolling). The crawler is
+    closed even when the crawl itself raises.
+    """
+    print("🔗 Hooks Example: Demonstrating different hook use cases")
+
+    # Configure browser settings
+    browser_config = BrowserConfig(
+        headless=True
+    )
+
+    # Configure crawler settings
+    crawler_run_config = CrawlerRunConfig(
+        js_code="window.scrollTo(0, document.body.scrollHeight);",
+        wait_for="body",
+        cache_mode=CacheMode.BYPASS
+    )
+
+    # Create crawler instance
+    crawler = AsyncWebCrawler(config=browser_config)
+
+    # Define hook functions
+    async def on_browser_created(browser, context: BrowserContext, **kwargs):
+        """Hook called after the browser is created"""
+        print("[HOOK] on_browser_created - Browser is ready!")
+        # Nothing is customised here; this is the place for browser-level setup.
+        return browser
+
+    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
+        """Hook called after a new page and context are created"""
+        print("[HOOK] on_page_context_created - New page created!")
+        # Example: add a session cookie to the context ...
+        await context.add_cookies([{
+            'name': 'session_id',
+            'value': 'example_session',
+            'domain': '.example.com',
+            'path': '/'
+        }])
+        # ... and set the viewport size of the new page.
+        await page.set_viewport_size({"width": 1920, "height": 1080})
+        return page
+
+    async def on_user_agent_updated(page: Page, context: BrowserContext, user_agent: str, **kwargs):
+        """Hook called when the user agent is updated"""
+        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
+        return page
+
+    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+        """Hook called after custom JavaScript execution"""
+        print("[HOOK] on_execution_started - Custom JS executed!")
+        return page
+
+    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
+        """Hook called before navigating to each URL"""
+        print(f"[HOOK] before_goto - About to visit: {url}")
+        # Example: Add custom headers for the request
+        await page.set_extra_http_headers({
+            "Custom-Header": "my-value"
+        })
+        return page
+
+    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
+        """Hook called after navigating to each URL"""
+        print(f"[HOOK] after_goto - Successfully loaded: {url}")
+        # Example: Wait for a specific element to be loaded
+        try:
+            await page.wait_for_selector('.content', timeout=1000)
+            print("Content element found!")
+        except Exception:
+            # Best effort: a missing element must not abort the crawl. Catching
+            # Exception (not a bare except) lets task cancellation and
+            # KeyboardInterrupt propagate.
+            print("Content element not found, continuing anyway")
+        return page
+
+    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
+        """Hook called before retrieving the HTML content"""
+        print("[HOOK] before_retrieve_html - About to get HTML content")
+        # Example: Scroll to bottom to trigger lazy loading
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+        return page
+
+    async def before_return_html(page: Page, context: BrowserContext, html:str, **kwargs):
+        """Hook called before returning the HTML content"""
+        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
+        # Example: You could modify the HTML content here if needed
+        return page
+
+    # Set all the hooks (registered before the crawler is started)
+    hooks = {
+        "on_browser_created": on_browser_created,
+        "on_page_context_created": on_page_context_created,
+        "on_user_agent_updated": on_user_agent_updated,
+        "on_execution_started": on_execution_started,
+        "before_goto": before_goto,
+        "after_goto": after_goto,
+        "before_retrieve_html": before_retrieve_html,
+        "before_return_html": before_return_html,
+    }
+    for hook_name, hook in hooks.items():
+        crawler.crawler_strategy.set_hook(hook_name, hook)
+
+    await crawler.start()
+
+    try:
+        # Example usage: crawl a simple website
+        url = 'https://example.com'
+        result = await crawler.arun(url, config=crawler_run_config)
+        print(f"\nCrawled URL: {result.url}")
+        print(f"HTML length: {len(result.html)}")
+    finally:
+        # Close the browser even if the crawl or the report raised.
+        await crawler.close()
+
+
+# Script entry point: run the example on a fresh event loop.
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -1,6 +1,8 @@
 import os, sys
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692"
+
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)

 import asyncio
 import time
@@ -12,7 +14,10 @@ from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
-from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+from crawl4ai.extraction_strategy import (
+    JsonCssExtractionStrategy,
+    LLMExtractionStrategy,
+)

 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

@@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai")
 print("Twitter: @unclecode")
 print("Website: https://crawl4ai.com")

+
 # Basic Example - Simple Crawl
 async def simple_crawl():
    print("\n--- Basic Usage ---")
    browser_config = BrowserConfig(headless=True)
-    crawler_config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS
-    )
-    
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
+            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])

+
+async def clean_content():
+    """Crawl a Wikipedia article and print the lengths of its raw and fit markdown."""
+    pruning_filter = PruningContentFilter(
+        threshold=0.48, threshold_type="fixed", min_word_threshold=0
+    )
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=pruning_filter, options={"ignore_links": True}
+    )
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        excluded_tags=["nav", "footer", "aside"],
+        remove_overlay_elements=True,
+        markdown_generator=md_generator,
+    )
+    async with AsyncWebCrawler() as crawler:
+        target = "https://en.wikipedia.org/wiki/Apple"
+        page = await crawler.arun(url=target, config=run_config)
+        raw_length = len(page.markdown_v2.raw_markdown)
+        fit_length = len(page.markdown_v2.fit_markdown)
+        print(f"Full Markdown Length: {raw_length}")
+        print(f"Fit Markdown Length: {fit_length}")
+
+async def link_analysis():
+    """Crawl the NBC News business page and print its link counts plus five internal links."""
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,
+        exclude_external_links=True,
+        exclude_social_media_links=True,
+    )
+    async with AsyncWebCrawler() as crawler:
+        page = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=run_config
+        )
+        internal_links = page.links["internal"]
+        print(f"Found {len(internal_links)} internal links")
+        print(f"Found {len(page.links['external'])} external links")
+        for entry in internal_links[:5]:  # preview at most five internal links
+            print(f"Href: {entry['href']}\nText: {entry['text']}\n")
+
 # JavaScript Execution Example
 async def simple_example_with_running_js_code():
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
-    
-    browser_config = BrowserConfig(
-        headless=True,
-        java_script_enabled=True
-    )
-    
+
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
-        js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+        js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
+            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])

+
 # CSS Selector Example
 async def simple_example_with_css_selector():
    print("\n--- Using CSS Selectors ---")
    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS,
-        css_selector=".wide-tease-item__description"
+        cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business", config=crawler_config
+        )
+        print(result.markdown[:500])
+
+async def media_handling():
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True)
+    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=crawler_config
        )
-        print(result.markdown[:500])
+        for img in result.media['images'][:5]:
+            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
+async def custom_hook_workflow(verbose=True):
+    """Crawl crawl4ai.com with a `before_goto` hook installed; `verbose` is currently unused."""
+    def announce_navigation(page, context, **kwargs):
+        # **kwargs tolerates extra hook arguments. NOTE(review): confirm the hook signature.
+        print("[Hook] Preparing to navigate...")
+
+    async with AsyncWebCrawler() as crawler:
+        crawler.crawler_strategy.set_hook("before_goto", announce_navigation)
+        result = await crawler.arun(url="https://crawl4ai.com")
+        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
+

 # Proxy Example
 async def use_proxy():
    print("\n--- Using a Proxy ---")
    browser_config = BrowserConfig(
        headless=True,
-        proxy="http://your-proxy-url:port"
+        proxy_config={
+            "server": "http://proxy.example.com:8080",
+            "username": "username",
+            "password": "password",
+        },
    )
-    crawler_config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS
-    )
-    
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url="https://www.nbcnews.com/business",
-            config=crawler_config
+            url="https://www.nbcnews.com/business", config=crawler_config
        )
        if result.success:
            print(result.markdown[:500])

+
 # Screenshot Example
 async def capture_and_save_screenshot(url: str, output_path: str):
    browser_config = BrowserConfig(headless=True)
-    crawler_config = CrawlerRunConfig(
-        cache_mode=CacheMode.BYPASS,
-        screenshot=True
-    )
-    
+    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url=url,
-            config=crawler_config
-        )
-        
+        result = await crawler.arun(url=url, config=crawler_config)
+
        if result.success and result.screenshot:
            import base64
+
            screenshot_data = base64.b64decode(result.screenshot)
-            with open(output_path, 'wb') as f:
+            with open(output_path, "wb") as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")

+
 # LLM Extraction Example
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
-    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )

-async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
    print(f"\n--- Extracting Structured Data with {provider} ---")
-    
+
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    browser_config = BrowserConfig(headless=True)
-    
-    extra_args = {
-        "temperature": 0,
-        "top_p": 0.9,
-        "max_tokens": 2000
-    }
+
+    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
-        page_timeout = 80000,
+        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
@@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
            Do not miss any models in the entire content.""",
-            extra_args=extra_args
-        )
+            extra_args=extra_args,
+        ),
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url="https://openai.com/api/pricing/",
-            config=crawler_config
+            url="https://openai.com/api/pricing/", config=crawler_config
        )
        print(result.extracted_content)

+
 # CSS Extraction Example
 async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor():
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
-                "attribute": "src"
-            }
-        ]
+                "attribute": "src",
+            },
+        ],
    }

-    browser_config = BrowserConfig(
-        headless=True,
-        java_script_enabled=True
-    )
-    
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
    js_click_tabs = """
    (async () => {
        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
@@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor():
        }
    })();
    """
-    
+
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
-        js_code=[js_click_tabs]
+        js_code=[js_click_tabs],
    )
-    
+
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
-            url="https://www.kidocode.com/degrees/technology",
-            config=crawler_config
+            url="https://www.kidocode.com/degrees/technology", config=crawler_config
        )

        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        print(json.dumps(companies[0], indent=2))

+
 # Dynamic Content Examples - Method 1
 async def crawl_dynamic_content_pages_method_1():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
@@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1():
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

-    browser_config = BrowserConfig(
-        headless=False,
-        java_script_enabled=True
-    )
+    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
@@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1():
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if page > 0 else None,
                js_only=page > 0,
-                session_id=session_id
+                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
@@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1():

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
 # Dynamic Content Examples - Method 2
 async def crawl_dynamic_content_pages_method_2():
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

-    browser_config = BrowserConfig(
-        headless=False,
-        java_script_enabled=True
-    )
+    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    js_next_page_and_wait = """
    (async () => {
@@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2():
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
-                session_id=session_id
+                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
@@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2():

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")

+
+async def cosine_similarity_extraction():
+    """Run CosineStrategy extraction on an NBC News article and print the first five parsed results."""
+    from crawl4ai.extraction_strategy import CosineStrategy  # not among the module-level imports
+    crawl_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=CosineStrategy(
+            word_count_threshold=10,
+            max_dist=0.2,  # Maximum distance between two words
+            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
+            top_k=3,  # Number of top keywords to extract
+            sim_threshold=0.3,  # Similarity threshold for clustering
+            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
+            verbose=True,
+        ),
+    )
+    async with AsyncWebCrawler() as crawler:
+        url = "https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156"
+        result = await crawler.arun(url=url, config=crawl_config)
+        print(json.loads(result.extracted_content)[:5])
+
 # Browser Comparison
 async def crawl_custom_browser_type():
    print("\n--- Browser Comparison ---")
-    
+
    # Firefox
-    browser_config_firefox = BrowserConfig(
-        browser_type="firefox",
-        headless=True
-    )
+    browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
-            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Firefox:", time.time() - start)
        print(result.markdown[:500])

    # WebKit
-    browser_config_webkit = BrowserConfig(
-        browser_type="webkit",
-        headless=True
-    )
+    browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
-            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("WebKit:", time.time() - start)
        print(result.markdown[:500])

    # Chromium (default)
-    browser_config_chromium = BrowserConfig(
-        browser_type="chromium",
-        headless=True
-    )
+    browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
    start = time.time()
    async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
-            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print("Chromium:", time.time() - start)
        print(result.markdown[:500])

+
 # Anti-Bot and User Simulation
 async def crawl_with_user_simulation():
    browser_config = BrowserConfig(
        headless=True,
        user_agent_mode="random",
-        user_agent_generator_config={
-            "device_type": "mobile",
-            "os_type": "android"
-        }
+        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    )

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
-        override_navigator=True
+        override_navigator=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url="YOUR-URL-HERE",
-            config=crawler_config
-        )
+        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)

+async def ssl_certification():
+    """Fetch example.com's SSL certificate, print its fields, and export it as JSON/PEM/DER."""
+    # NOTE(review): no module-level `tmp_dir` is visible in this file, so build it here.
+    tmp_dir = os.path.join(__location__, "tmp")
+    os.makedirs(tmp_dir, exist_ok=True)
+
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+
+        if result.success and result.ssl_certificate:
+            cert = result.ssl_certificate
+
+            # 1. Access certificate properties directly
+            print("\nCertificate Information:")
+            print(f"Issuer: {cert.issuer.get('CN', '')}")
+            print(f"Valid until: {cert.valid_until}")
+            print(f"Fingerprint: {cert.fingerprint}")
+
+            # 2. Export certificate in different formats (return values are not needed)
+            cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
+            print("\nCertificate exported to:")
+            print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+            cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
+            print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+            cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
+            print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
 # Speed Comparison
 async def speed_comparison():
    print("\n--- Speed Comparison ---")
-    
+
    # Firecrawl comparison
    from firecrawl import FirecrawlApp
-    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+
+    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    start = time.time()
    scrape_status = app.scrape_url(
-        'https://www.nbcnews.com/business',
-        params={'formats': ['markdown', 'html']}
+        "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print("Firecrawl:")
@@ -447,16 +538,15 @@ async def speed_comparison():

    # Crawl4AI comparisons
    browser_config = BrowserConfig(headless=True)
-    
+
    # Simple crawl
    async with AsyncWebCrawler(config=browser_config) as crawler:
        start = time.time()
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=CrawlerRunConfig(
-                cache_mode=CacheMode.BYPASS,
-                word_count_threshold=0
-            )
+                cache_mode=CacheMode.BYPASS, word_count_threshold=0
+            ),
        )
        end = time.time()
        print("Crawl4AI (simple crawl):")
@@ -474,12 +564,10 @@ async def speed_comparison():
                word_count_threshold=0,
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(
-                        threshold=0.48,
-                        threshold_type="fixed",
-                        min_word_threshold=0
+                        threshold=0.48, threshold_type="fixed", min_word_threshold=0
                    )
-                )
-            )
+                ),
+            ),
        )
        end = time.time()
        print("Crawl4AI (Markdown Plus):")
@@ -489,22 +577,25 @@ async def speed_comparison():
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

+
 # Main execution
 async def main():
    # Basic examples
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
-    
+
    # Advanced examples
    # await extract_structured_data_using_css_extractor()
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm(
+        "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
+    )
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
-    
+
    # Browser comparisons
    # await crawl_custom_browser_type()
-    
+
    # Performance testing
    # await speed_comparison()

@@ -514,5 +605,6 @@ async def main():
    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
    # )

+
 if __name__ == "__main__":
-    asyncio.run(main())
+    asyncio.run(main())
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -627,13 +627,13 @@ async def main():
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    
-    await crawl_dynamic_content_pages_method_1()
-    await crawl_dynamic_content_pages_method_2()
+    # await crawl_dynamic_content_pages_method_1()
+    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
    
-    await crawl_custom_browser_type()
+    # await crawl_custom_browser_type()
    
-    await speed_comparison()
+    # await speed_comparison()


 if __name__ == "__main__":
--- a/docs/examples/ssl_example.py
+++ b/docs/examples/ssl_example.py
@@ -0,0 +1,46 @@
+"""Example showing how to work with SSL certificates in Crawl4AI."""
+
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Resolve "<dir>/tmp", where <dir> is three dirname() steps up from this file, and create it if missing
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+tmp_dir = os.path.join(parent_dir, "tmp")
+os.makedirs(tmp_dir, exist_ok=True)
+
+async def main():
+    """Fetch example.com's SSL certificate, print its fields, and export it into `tmp_dir`."""
+    # Configure crawler to fetch SSL certificate
+    config = CrawlerRunConfig(
+        fetch_ssl_certificate=True,
+        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+
+        # Nothing to report when the crawl failed or returned no certificate.
+        if not (result.success and result.ssl_certificate):
+            return
+        cert = result.ssl_certificate
+
+        # 1. Access certificate properties directly
+        print("\nCertificate Information:")
+        print(f"Issuer: {cert.issuer.get('CN', '')}")
+        print(f"Valid until: {cert.valid_until}")
+        print(f"Fingerprint: {cert.fingerprint}")
+
+        # 2. Export certificate in different formats (return values are not needed)
+        cert.to_json(os.path.join(tmp_dir, "certificate.json"))  # For analysis
+        print("\nCertificate exported to:")
+        print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+        cert.to_pem(os.path.join(tmp_dir, "certificate.pem"))  # For web servers
+        print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+        cert.to_der(os.path.join(tmp_dir, "certificate.der"))  # For Java apps
+        print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/tmp/chainlit_review.py
+++ b/docs/examples/tmp/chainlit_review.py
@@ -1,281 +0,0 @@
-from openai import AsyncOpenAI
-from chainlit.types import ThreadDict
-import chainlit as cl
-from chainlit.input_widget import Select, Switch, Slider
-client = AsyncOpenAI()
-
-# Instrument the OpenAI client
-cl.instrument_openai()
-
-settings = {
-    "model": "gpt-3.5-turbo",
-    "temperature": 0.5,
-    "max_tokens": 500,
-    "top_p": 1,
-    "frequency_penalty": 0,
-    "presence_penalty": 0,
-}
-
-@cl.action_callback("action_button")
-async def on_action(action: cl.Action):
-    print("The user clicked on the action button!")
-
-    return "Thank you for clicking on the action button!"
-
-@cl.set_chat_profiles
-async def chat_profile():
-    return [
-        cl.ChatProfile(
-            name="GPT-3.5",
-            markdown_description="The underlying LLM model is **GPT-3.5**.",
-            icon="https://picsum.photos/200",
-        ),
-        cl.ChatProfile(
-            name="GPT-4",
-            markdown_description="The underlying LLM model is **GPT-4**.",
-            icon="https://picsum.photos/250",
-        ),
-    ]
-
-@cl.on_chat_start
-async def on_chat_start():
-    
-    settings = await cl.ChatSettings(
-        [
-            Select(
-                id="Model",
-                label="OpenAI - Model",
-                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
-                initial_index=0,
-            ),
-            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
-            Slider(
-                id="Temperature",
-                label="OpenAI - Temperature",
-                initial=1,
-                min=0,
-                max=2,
-                step=0.1,
-            ),
-            Slider(
-                id="SAI_Steps",
-                label="Stability AI - Steps",
-                initial=30,
-                min=10,
-                max=150,
-                step=1,
-                description="Amount of inference steps performed on image generation.",
-            ),
-            Slider(
-                id="SAI_Cfg_Scale",
-                label="Stability AI - Cfg_Scale",
-                initial=7,
-                min=1,
-                max=35,
-                step=0.1,
-                description="Influences how strongly your generation is guided to match your prompt.",
-            ),
-            Slider(
-                id="SAI_Width",
-                label="Stability AI - Image Width",
-                initial=512,
-                min=256,
-                max=2048,
-                step=64,
-                tooltip="Measured in pixels",
-            ),
-            Slider(
-                id="SAI_Height",
-                label="Stability AI - Image Height",
-                initial=512,
-                min=256,
-                max=2048,
-                step=64,
-                tooltip="Measured in pixels",
-            ),
-        ]
-    ).send()
-    
-    chat_profile = cl.user_session.get("chat_profile")
-    await cl.Message(
-        content=f"starting chat using the {chat_profile} chat profile"
-    ).send()
-    
-    print("A new chat session has started!")
-    cl.user_session.set("session", {
-        "history": [],
-        "context": []
-    })  
-    
-    image = cl.Image(url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif", name="cat image", display="inline")
-
-    # Attach the image to the message
-    await cl.Message(
-        content="You are such a good girl, aren't you?!",
-        elements=[image],
-    ).send()
-    
-    text_content = "Hello, this is a text element."
-    elements = [
-        cl.Text(name="simple_text", content=text_content, display="inline")
-    ]
-
-    await cl.Message(
-        content="Check out this text element!",
-        elements=elements,
-    ).send()
-    
-    elements = [
-        cl.Audio(path="./assets/audio.mp3", display="inline"),
-    ]
-    await cl.Message(
-        content="Here is an audio file",
-        elements=elements,
-    ).send()
-    
-    await cl.Avatar(
-        name="Tool 1",
-        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
-    ).send()
-    
-    await cl.Message(
-        content="This message should not have an avatar!", author="Tool 0"
-    ).send()
-    
-    await cl.Message(
-        content="This message should have an avatar!", author="Tool 1"
-    ).send()
-    
-    elements = [
-        cl.File(
-            name="quickstart.py",
-            path="./quickstart.py",
-            display="inline",
-        ),
-    ]
-
-    await cl.Message(
-        content="This message has a file element", elements=elements
-    ).send()
-    
-    # Sending an action button within a chatbot message
-    actions = [
-        cl.Action(name="action_button", value="example_value", description="Click me!")
-    ]
-
-    await cl.Message(content="Interact with this action button:", actions=actions).send()
-    
-    # res = await cl.AskActionMessage(
-    #     content="Pick an action!",
-    #     actions=[
-    #         cl.Action(name="continue", value="continue", label="✅ Continue"),
-    #         cl.Action(name="cancel", value="cancel", label="❌ Cancel"),
-    #     ],
-    # ).send()
-
-    # if res and res.get("value") == "continue":
-    #     await cl.Message(
-    #         content="Continue!",
-    #     ).send()
-    
-    # import plotly.graph_objects as go
-    # fig = go.Figure(
-    #     data=[go.Bar(y=[2, 1, 3])],
-    #     layout_title_text="An example figure",
-    # )
-    # elements = [cl.Plotly(name="chart", figure=fig, display="inline")]
-
-    # await cl.Message(content="This message has a chart", elements=elements).send()
-    
-    # Sending a pdf with the local file path
-    # elements = [
-    #   cl.Pdf(name="pdf1", display="inline", path="./pdf1.pdf")
-    # ]
-
-    # cl.Message(content="Look at this local pdf!", elements=elements).send()    
-
-@cl.on_settings_update
-async def setup_agent(settings):
-    print("on_settings_update", settings)
-    
-@cl.on_stop
-def on_stop():
-    print("The user wants to stop the task!")
-
-@cl.on_chat_end
-def on_chat_end():
-    print("The user disconnected!")
-
-
-@cl.on_chat_resume
-async def on_chat_resume(thread: ThreadDict):
-    print("The user resumed a previous chat session!")
-
-
-
-
-# @cl.on_message
-async def on_message(message: cl.Message):
-    cl.user_session.get("session")["history"].append({
-        "role": "user",
-        "content": message.content
-    })    
-    response = await client.chat.completions.create(
-        messages=[
-            {
-                "content": "You are a helpful bot",
-                "role": "system"
-            },
-            *cl.user_session.get("session")["history"]
-        ],
-        **settings
-    )
-    
-
-    # Add assitanr message to the history
-    cl.user_session.get("session")["history"].append({
-        "role": "assistant",
-        "content": response.choices[0].message.content
-    })
-    
-    # msg.content = response.choices[0].message.content
-    # await msg.update()
-    
-    # await cl.Message(content=response.choices[0].message.content).send()
-
-@cl.on_message
-async def on_message(message: cl.Message):
-    cl.user_session.get("session")["history"].append({
-        "role": "user",
-        "content": message.content
-    })    
-
-    msg = cl.Message(content="")
-    await msg.send()    
-    
-    stream = await client.chat.completions.create(
-        messages=[
-            {
-                "content": "You are a helpful bot",
-                "role": "system"
-            },
-            *cl.user_session.get("session")["history"]
-        ],
-        stream = True, 
-        **settings
-    )
-    
-    async for part in stream:
-        if token := part.choices[0].delta.content or "":
-            await msg.stream_token(token)
-    
-    # Add assitanr message to the history
-    cl.user_session.get("session")["history"].append({
-        "role": "assistant",
-        "content": msg.content
-    })    
-    await msg.update()
-
-if __name__ == "__main__":
-    from chainlit.cli import run_chainlit
-    run_chainlit(__file__)
--- a/docs/examples/tmp/research_assistant_audio_not_completed.py
+++ b/docs/examples/tmp/research_assistant_audio_not_completed.py
@@ -1,238 +0,0 @@
-# Make sure to install the required packageschainlit and groq
-import os, time
-from openai import AsyncOpenAI
-import chainlit as cl
-import re
-import requests
-from io import BytesIO
-from chainlit.element import ElementBased
-from groq import Groq
-
-# Import threadpools to run the crawl_url function in a separate thread
-from concurrent.futures import ThreadPoolExecutor
-
-client = AsyncOpenAI(base_url="https://api.groq.com/openai/v1", api_key=os.getenv("GROQ_API_KEY"))
-
-# Instrument the OpenAI client
-cl.instrument_openai()
-
-settings = {
-    "model": "llama3-8b-8192",
-    "temperature": 0.5,
-    "max_tokens": 500,
-    "top_p": 1,
-    "frequency_penalty": 0,
-    "presence_penalty": 0,
-}
-
-def extract_urls(text):
-    url_pattern = re.compile(r'(https?://\S+)')
-    return url_pattern.findall(text)
-
-def crawl_url(url):
-    data = {
-        "urls": [url],
-        "include_raw_html": True,
-        "word_count_threshold": 10,
-        "extraction_strategy": "NoExtractionStrategy",
-        "chunking_strategy": "RegexChunking"
-    }
-    response = requests.post("https://crawl4ai.com/crawl", json=data)
-    response_data = response.json()
-    response_data = response_data['results'][0]
-    return response_data['markdown']
-
-@cl.on_chat_start
-async def on_chat_start():
-    cl.user_session.set("session", {
-        "history": [],
-        "context": {}
-    })  
-    await cl.Message(
-        content="Welcome to the chat! How can I assist you today?"
-    ).send()
-
-@cl.on_message
-async def on_message(message: cl.Message):
-    user_session = cl.user_session.get("session")
-    
-    # Extract URLs from the user's message
-    urls = extract_urls(message.content)
-    
-    
-    futures = []
-    with ThreadPoolExecutor() as executor:
-        for url in urls:
-            futures.append(executor.submit(crawl_url, url))
-
-    results = [future.result() for future in futures]
-
-    for url, result in zip(urls, results):
-        ref_number = f"REF_{len(user_session['context']) + 1}"
-        user_session["context"][ref_number] = {
-            "url": url,
-            "content": result
-        }    
-    
-    # for url in urls:
-    #     # Crawl the content of each URL and add it to the session context with a reference number
-    #     ref_number = f"REF_{len(user_session['context']) + 1}"
-    #     crawled_content = crawl_url(url)
-    #     user_session["context"][ref_number] = {
-    #         "url": url,
-    #         "content": crawled_content
-    #     }
-
-    user_session["history"].append({
-        "role": "user",
-        "content": message.content
-    })
-
-    # Create a system message that includes the context
-    context_messages = [
-        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
-        for ref, data in user_session["context"].items()
-    ]
-    if context_messages:
-        system_message = {
-            "role": "system",
-            "content": (
-                "You are a helpful bot. Use the following context for answering questions. "
-                "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
-                "If the question requires any information from the provided appendices or context, refer to the sources. "
-                "If not, there is no need to add a references section. "
-                "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
-                "\n\n".join(context_messages)
-            )
-        }
-    else:
-        system_message = {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        }
-
-
-    msg = cl.Message(content="")
-    await msg.send()
-
-    # Get response from the LLM
-    stream = await client.chat.completions.create(
-        messages=[
-            system_message,
-            *user_session["history"]
-        ],
-        stream=True,
-        **settings
-    )
-
-    assistant_response = ""
-    async for part in stream:
-        if token := part.choices[0].delta.content:
-            assistant_response += token
-            await msg.stream_token(token)
-
-    # Add assistant message to the history
-    user_session["history"].append({
-        "role": "assistant",
-        "content": assistant_response
-    })
-    await msg.update()
-
-    # Append the reference section to the assistant's response
-    reference_section = "\n\nReferences:\n"
-    for ref, data in user_session["context"].items():
-        reference_section += f"[{ref.split('_')[1]}]: {data['url']}\n"
-
-    msg.content += reference_section
-    await msg.update()
-
-
-@cl.on_audio_chunk
-async def on_audio_chunk(chunk: cl.AudioChunk):
-    if chunk.isStart:
-        buffer = BytesIO()
-        # This is required for whisper to recognize the file type
-        buffer.name = f"input_audio.{chunk.mimeType.split('/')[1]}"
-        # Initialize the session for a new audio stream
-        cl.user_session.set("audio_buffer", buffer)
-        cl.user_session.set("audio_mime_type", chunk.mimeType)
-
-    # Write the chunks to a buffer and transcribe the whole audio at the end
-    cl.user_session.get("audio_buffer").write(chunk.data)
-
-    pass
-
-@cl.step(type="tool")
-async def speech_to_text(audio_file):
-    cli = Groq()
-    
-    # response = cli.audio.transcriptions.create(
-    #     file=audio_file, #(filename, file.read()),
-    #     model="whisper-large-v3",
-    # )
-    
-    response = await client.audio.transcriptions.create(
-        model="whisper-large-v3", file=audio_file
-    )
-
-    return response.text
-
-
-@cl.on_audio_end
-async def on_audio_end(elements: list[ElementBased]):
-    # Get the audio buffer from the session
-    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
-    audio_buffer.seek(0)  # Move the file pointer to the beginning
-    audio_file = audio_buffer.read()
-    audio_mime_type: str = cl.user_session.get("audio_mime_type")
-
-    # input_audio_el = cl.Audio(
-    #     mime=audio_mime_type, content=audio_file, name=audio_buffer.name
-    # )
-    # await cl.Message(
-    #     author="You", 
-    #     type="user_message",
-    #     content="",
-    #     elements=[input_audio_el, *elements]
-    # ).send()
-    
-    # answer_message = await cl.Message(content="").send()
-    
-    
-    start_time = time.time()
-    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)
-    transcription = await speech_to_text(whisper_input)
-    end_time = time.time()
-    print(f"Transcription took {end_time - start_time} seconds")
-    
-    user_msg = cl.Message(
-        author="You", 
-        type="user_message",
-        content=transcription
-    )
-    await user_msg.send()
-    await on_message(user_msg)
-
-    # images = [file for file in elements if "image" in file.mime]
-
-    # text_answer = await generate_text_answer(transcription, images)
-    
-    # output_name, output_audio = await text_to_speech(text_answer, audio_mime_type)
-    
-    # output_audio_el = cl.Audio(
-    #     name=output_name,
-    #     auto_play=True,
-    #     mime=audio_mime_type,
-    #     content=output_audio,
-    # )
-    
-    # answer_message.elements = [output_audio_el]
-    
-    # answer_message.content = transcription
-    # await answer_message.update()
-
-if __name__ == "__main__":
-    from chainlit.cli import run_chainlit
-    run_chainlit(__file__)
-
-