Merge pull request #9 from aravindkarnam/main

Pulling version 0.4.22 from main into scraper
2024-12-17 18:43:36 +05:30
parent 2f5e0598bb ed7bc1909c
commit 7c0fa269a6
72 changed files with 10610 additions and 5540 deletions
--- a/docs/examples/docker_example.py
+++ b/docs/examples/docker_example.py
@@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"):
            time.sleep(5)
    
    # Test cases based on version
-    # test_basic_crawl(tester)
-    # test_basic_crawl(tester)
-    # test_basic_crawl_sync(tester)
    test_basic_crawl_direct(tester)
+    test_basic_crawl(tester)
+    test_basic_crawl(tester)
+    test_basic_crawl_sync(tester)
    
-    # if version in ["full", "transformer"]:
-    #     test_cosine_extraction(tester)
+    if version in ["full", "transformer"]:
+        test_cosine_extraction(tester)

-    # test_js_execution(tester)
-    # test_css_selector(tester)
-    # test_structured_extraction(tester)
-    # test_llm_extraction(tester)
-    # test_llm_with_ollama(tester)
-    # test_screenshot(tester)
+    test_js_execution(tester)
+    test_css_selector(tester)
+    test_structured_extraction(tester)
+    test_llm_extraction(tester)
+    test_llm_with_ollama(tester)
+    test_screenshot(tester)
    

 def test_basic_crawl(tester: Crawl4AiTester):
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -0,0 +1,58 @@
+# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI
+
+When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences.
+
+**The New Approach:**
+We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic.
+
+**Key Benefits:**
+- **Reliability:** The PDF export never times out and works regardless of page length.
+- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing.
+- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime.
+
+**Simple Example:**
+```python
+import os, sys
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+# Adjust paths as needed
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        # Request both PDF and screenshot
+        result = await crawler.arun(
+            url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
+            cache_mode=CacheMode.BYPASS,
+            pdf=True,
+            screenshot=True
+        )
+        
+        if result.success:
+            from base64 import b64decode  # used by both branches below
+            # Save screenshot
+            if result.screenshot:
+                with open(os.path.join(__location__, "screenshot.png"), "wb") as f:
+                    f.write(b64decode(result.screenshot))
+            
+            # Save PDF
+            if result.pdf_data:
+                pdf_bytes = b64decode(result.pdf_data)
+                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
+                    f.write(pdf_bytes)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**What Happens Under the Hood:**
+- Crawl4AI navigates to the target page.
+- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length.
+- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling.
+- Finally, you get your PDF and/or screenshot ready to use.
+
+**Conclusion:**
+With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages.
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -0,0 +1,518 @@
+import os, sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+# FIRECRAWL_API_KEY (used only by speed_comparison) must come from your environment; never commit a real key.
+
+import asyncio
+import time
+import json
+import re
+from typing import Dict, List
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+# Basic Example - Simple Crawl
+async def simple_crawl():
+    print("\n--- Basic Usage ---")
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        print(result.markdown[:500])
+
+# JavaScript Execution Example
+async def simple_example_with_running_js_code():
+    print("\n--- Executing JavaScript and Using CSS Selectors ---")
+    
+    browser_config = BrowserConfig(
+        headless=True,
+        java_script_enabled=True
+    )
+    
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
+        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        print(result.markdown[:500])
+
+# CSS Selector Example
+async def simple_example_with_css_selector():
+    print("\n--- Using CSS Selectors ---")
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        css_selector=".wide-tease-item__description"
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        print(result.markdown[:500])
+
+# Proxy Example
+async def use_proxy():
+    print("\n--- Using a Proxy ---")
+    browser_config = BrowserConfig(
+        headless=True,
+        proxy="http://your-proxy-url:port"
+    )
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        if result.success:
+            print(result.markdown[:500])
+
+# Screenshot Example
+async def capture_and_save_screenshot(url: str, output_path: str):
+    browser_config = BrowserConfig(headless=True)
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url=url,
+            config=crawler_config
+        )
+        
+        if result.success and result.screenshot:
+            import base64
+            screenshot_data = base64.b64decode(result.screenshot)
+            with open(output_path, 'wb') as f:
+                f.write(screenshot_data)
+            print(f"Screenshot saved successfully to {output_path}")
+        else:
+            print("Failed to capture screenshot")
+
+# LLM Extraction Example
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+
+async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+    
+    if api_token is None and provider != "ollama":
+        print(f"API token is required for {provider}. Skipping this example.")
+        return
+
+    browser_config = BrowserConfig(headless=True)
+    
+    extra_args = {
+        "temperature": 0,
+        "top_p": 0.9,
+        "max_tokens": 2000
+    }
+    if extra_headers:
+        extra_args["extra_headers"] = extra_headers
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=1,
+        page_timeout = 80000,
+        extraction_strategy=LLMExtractionStrategy(
+            provider=provider,
+            api_token=api_token,
+            schema=OpenAIModelFee.model_json_schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+            Do not miss any models in the entire content.""",
+            extra_args=extra_args
+        )
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://openai.com/api/pricing/",
+            config=crawler_config
+        )
+        print(result.extracted_content)
+
+# CSS Extraction Example
+async def extract_structured_data_using_css_extractor():
+    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+    schema = {
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src"
+            }
+        ]
+    }
+
+    browser_config = BrowserConfig(
+        headless=True,
+        java_script_enabled=True
+    )
+    
+    js_click_tabs = """
+    (async () => {
+        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+        for(let tab of tabs) {
+            tab.scrollIntoView();
+            tab.click();
+            await new Promise(r => setTimeout(r, 500));
+        }
+    })();
+    """
+    
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        js_code=[js_click_tabs]
+    )
+    
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.kidocode.com/degrees/technology",
+            config=crawler_config
+        )
+
+        companies = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(companies)} companies")
+        print(json.dumps(companies[0], indent=2))
+
+# Dynamic Content Examples - Method 1
+async def crawl_dynamic_content_pages_method_1():
+    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+    first_commit = ""
+
+    async def on_execution_started(page, **kwargs):
+        nonlocal first_commit
+        try:
+            while True:
+                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+                commit = await commit.evaluate("(element) => element.textContent")
+                commit = re.sub(r"\s+", "", commit)
+                if commit and commit != first_commit:
+                    first_commit = commit
+                    break
+                await asyncio.sleep(0.5)
+        except Exception as e:
+            print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+    browser_config = BrowserConfig(
+        headless=False,
+        java_script_enabled=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        session_id = "typescript_commits_session"
+        all_commits = []
+
+        js_next_page = """
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+        """
+
+        for page in range(3):
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                css_selector="li.Box-sc-g0xbh4-0",
+                js_code=js_next_page if page > 0 else None,
+                js_only=page > 0,
+                session_id=session_id
+            )
+
+            result = await crawler.arun(url=url, config=crawler_config)
+            assert result.success, f"Failed to crawl page {page + 1}"
+
+            soup = BeautifulSoup(result.cleaned_html, "html.parser")
+            commits = soup.select("li")
+            all_commits.extend(commits)
+
+            print(f"Page {page + 1}: Found {len(commits)} commits")
+
+        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+# Dynamic Content Examples - Method 2
+async def crawl_dynamic_content_pages_method_2():
+    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+    browser_config = BrowserConfig(
+        headless=False,
+        java_script_enabled=True
+    )
+
+    js_next_page_and_wait = """
+    (async () => {
+        const getCurrentCommit = () => {
+            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+            return commits.length > 0 ? commits[0].textContent.trim() : null;
+        };
+
+        const initialCommit = getCurrentCommit();
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+
+        while (true) {
+            await new Promise(resolve => setTimeout(resolve, 100));
+            const newCommit = getCurrentCommit();
+            if (newCommit && newCommit !== initialCommit) {
+                break;
+            }
+        }
+    })();
+    """
+
+    schema = {
+        "name": "Commit Extractor",
+        "baseSelector": "li.Box-sc-g0xbh4-0",
+        "fields": [
+            {
+                "name": "title",
+                "selector": "h4.markdown-title",
+                "type": "text",
+                "transform": "strip",
+            },
+        ],
+    }
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        session_id = "typescript_commits_session"
+        all_commits = []
+
+        extraction_strategy = JsonCssExtractionStrategy(schema)
+
+        for page in range(3):
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                css_selector="li.Box-sc-g0xbh4-0",
+                extraction_strategy=extraction_strategy,
+                js_code=js_next_page_and_wait if page > 0 else None,
+                js_only=page > 0,
+                session_id=session_id
+            )
+
+            result = await crawler.arun(url=url, config=crawler_config)
+            assert result.success, f"Failed to crawl page {page + 1}"
+
+            commits = json.loads(result.extracted_content)
+            all_commits.extend(commits)
+            print(f"Page {page + 1}: Found {len(commits)} commits")
+
+        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+# Browser Comparison
+async def crawl_custom_browser_type():
+    print("\n--- Browser Comparison ---")
+    
+    # Firefox
+    browser_config_firefox = BrowserConfig(
+        browser_type="firefox",
+        headless=True
+    )
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+        )
+        print("Firefox:", time.time() - start)
+        print(result.markdown[:500])
+
+    # WebKit
+    browser_config_webkit = BrowserConfig(
+        browser_type="webkit",
+        headless=True
+    )
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+        )
+        print("WebKit:", time.time() - start)
+        print(result.markdown[:500])
+
+    # Chromium (default)
+    browser_config_chromium = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+    start = time.time()
+    async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+        )
+        print("Chromium:", time.time() - start)
+        print(result.markdown[:500])
+
+# Anti-Bot and User Simulation
+async def crawl_with_user_simulation():
+    browser_config = BrowserConfig(
+        headless=True,
+        user_agent_mode="random",
+        user_agent_generator_config={
+            "device_type": "mobile",
+            "os_type": "android"
+        }
+    )
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        magic=True,
+        simulate_user=True,
+        override_navigator=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="YOUR-URL-HERE",
+            config=crawler_config
+        )
+        print(result.markdown)
+
+# Speed Comparison
+async def speed_comparison():
+    print("\n--- Speed Comparison ---")
+    
+    # Firecrawl comparison
+    from firecrawl import FirecrawlApp
+    app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+    start = time.time()
+    scrape_status = app.scrape_url(
+        'https://www.nbcnews.com/business',
+        params={'formats': ['markdown', 'html']}
+    )
+    end = time.time()
+    print("Firecrawl:")
+    print(f"Time taken: {end - start:.2f} seconds")
+    print(f"Content length: {len(scrape_status['markdown'])} characters")
+    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
+    print()
+
+    # Crawl4AI comparisons
+    browser_config = BrowserConfig(headless=True)
+    
+    # Simple crawl
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        start = time.time()
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                word_count_threshold=0
+            )
+        )
+        end = time.time()
+        print("Crawl4AI (simple crawl):")
+        print(f"Time taken: {end - start:.2f} seconds")
+        print(f"Content length: {len(result.markdown)} characters")
+        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+        print()
+
+        # Advanced filtering
+        start = time.time()
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                word_count_threshold=0,
+                markdown_generator=DefaultMarkdownGenerator(
+                    content_filter=PruningContentFilter(
+                        threshold=0.48,
+                        threshold_type="fixed",
+                        min_word_threshold=0
+                    )
+                )
+            )
+        )
+        end = time.time()
+        print("Crawl4AI (Markdown Plus):")
+        print(f"Time taken: {end - start:.2f} seconds")
+        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
+        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+        print()
+
+# Main execution
+async def main():
+    # Basic examples
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    
+    # Advanced examples
+    # await extract_structured_data_using_css_extractor()
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await crawl_dynamic_content_pages_method_1()
+    # await crawl_dynamic_content_pages_method_2()
+    
+    # Browser comparisons
+    # await crawl_custom_browser_type()
+    
+    # Performance testing
+    # await speed_comparison()
+
+    # Screenshot example
+    # await capture_and_save_screenshot(
+    #     "https://www.example.com",
+    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
+    # )
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -13,7 +13,9 @@ import re
 from typing import Dict, List
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
 from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
@@ -30,7 +32,7 @@ print("Website: https://crawl4ai.com")
 async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(url="https://www.nbcnews.com/business")
+        result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])  # Print first 500 characters

 async def simple_example_with_running_js_code():
@@ -51,7 +53,7 @@ async def simple_example_with_running_js_code():
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            # wait_for=wait_for,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.markdown[:500])  # Print first 500 characters

@@ -61,7 +63,7 @@ async def simple_example_with_css_selector():
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            css_selector=".wide-tease-item__description",
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.markdown[:500])  # Print first 500 characters

@@ -74,16 +76,17 @@ async def use_proxy():
    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
-            bypass_cache=True
+            cache_mode= CacheMode.BYPASS
        )
-        print(result.markdown[:500])  # Print first 500 characters
+        if result.success:
+            print(result.markdown[:500])  # Print first 500 characters

 async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
-            bypass_cache=True
+            cache_mode= CacheMode.BYPASS
        )
        
        if result.success and result.screenshot:
@@ -114,7 +117,13 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
        print(f"API token is required for {provider}. Skipping this example.")
        return

-    extra_args = {}
+    # extra_args = {}
+    extra_args={
+        "temperature": 0, 
+        "top_p": 0.9,
+        "max_tokens": 2000,
+        # any other supported parameters for litellm
+    }
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

@@ -125,55 +134,82 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
-                schema=OpenAIModelFee.schema(),
+                schema=OpenAIModelFee.model_json_schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
                Do not miss any models in the entire content. One extracted model JSON format should look like this: 
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
                extra_args=extra_args
            ),
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)

 async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
-        "name": "Coinbase Crypto Prices",
-        "baseSelector": ".cds-tableRow-t45thuk",
-        "fields": [
-            {
-                "name": "crypto",
-                "selector": "td:nth-child(1) h2",
-                "type": "text",
-            },
-            {
-                "name": "symbol",
-                "selector": "td:nth-child(1) p",
-                "type": "text",
-            },
-            {
-                "name": "price",
-                "selector": "td:nth-child(2)",
-                "type": "text",
+    "name": "KidoCode Courses",
+    "baseSelector": "section.charge-methodology .w-tab-content > div",
+    "fields": [
+        {
+            "name": "section_title",
+            "selector": "h3.heading-50",
+            "type": "text",
+        },
+        {
+            "name": "section_description",
+            "selector": ".charge-content",
+            "type": "text",
+        },
+        {
+            "name": "course_name",
+            "selector": ".text-block-93",
+            "type": "text",
+        },
+        {
+            "name": "course_description",
+            "selector": ".course-content-text",
+            "type": "text",
+        },
+        {
+            "name": "course_icon",
+            "selector": ".image-92",
+            "type": "attribute",
+            "attribute": "src"
+        }
+    ]
+}
+
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True
+    ) as crawler:
+        
+        # Create the JavaScript that handles clicking multiple times
+        js_click_tabs = """
+        (async () => {
+            const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+            
+            for(let tab of tabs) {
+                // scroll to the tab
+                tab.scrollIntoView();
+                tab.click();
+                // Wait for content to load and animations to complete
+                await new Promise(r => setTimeout(r, 500));
            }
-        ],
-    }
+        })();
+        """     

-    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
-    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url="https://www.coinbase.com/explore",
-            extraction_strategy=extraction_strategy,
-            bypass_cache=True,
+            url="https://www.kidocode.com/degrees/technology",
+            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
+            js_code=[js_click_tabs],
+            cache_mode=CacheMode.BYPASS
        )

-        assert result.success, "Failed to crawl the page"
-
-        news_teasers = json.loads(result.extracted_content)
-        print(f"Successfully extracted {len(news_teasers)} news teasers")
-        print(json.dumps(news_teasers[0], indent=2))
+        companies = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(companies)} companies")
+        print(json.dumps(companies[0], indent=2))

 # Advanced Session-Based Crawling with Dynamic Content 🔄
 async def crawl_dynamic_content_pages_method_1():
@@ -203,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
        all_commits = []

        js_next_page = """
-        const button = document.querySelector('a[data-testid="pagination-next-button"]');
-        if (button) button.click();
+        (() => {
+            const button = document.querySelector('a[data-testid="pagination-next-button"]');
+            if (button) button.click();
+        })();
        """

        for page in range(3):  # Crawl 3 pages
@@ -213,7 +251,7 @@ async def crawl_dynamic_content_pages_method_1():
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                js=js_next_page if page > 0 else None,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                js_only=page > 0,
                headless=False,
            )
@@ -282,7 +320,7 @@ async def crawl_dynamic_content_pages_method_2():
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if page > 0 else None,
                js_only=page > 0,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

@@ -343,7 +381,7 @@ async def crawl_dynamic_content_pages_method_3():
                js_code=js_next_page if page > 0 else None,
                wait_for=wait_for if page > 0 else None,
                js_only=page > 0,
-                bypass_cache=True,
+                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

@@ -361,21 +399,21 @@ async def crawl_custom_browser_type():
    # Use Firefox
    start = time.time()
    async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use WebKit
    start = time.time()
    async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

    # Use Chromium (default)
    start = time.time()
    async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)

@@ -384,7 +422,7 @@ async def crawl_with_user_simultion():
        url = "YOUR-URL-HERE"
        result = await crawler.arun(
            url=url,            
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            magic = True, # Automatically detects and removes overlays, popups, and other elements that block content
            # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
            # override_navigator = True # Overrides the navigator object to make it look like a real user
@@ -408,7 +446,7 @@ async def speed_comparison():
    params={'formats': ['markdown', 'html']}
    )
    end = time.time()
-    print("Firecrawl (simulated):")
+    print("Firecrawl:")
    print(f"Time taken: {end - start:.2f} seconds")
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
@@ -420,7 +458,7 @@ async def speed_comparison():
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            word_count_threshold=0,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        end = time.time()
@@ -430,6 +468,26 @@ async def speed_comparison():
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
        print()

+        # Crawl4AI with advanced content filtering
+        start = time.time()
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            word_count_threshold=0,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+            ),
+            cache_mode=CacheMode.BYPASS,
+            verbose=False,
+        )
+        end = time.time()
+        print("Crawl4AI (Markdown Plus):")
+        print(f"Time taken: {end - start:.2f} seconds")
+        print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
+        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
+        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
+        print()
+
        # Crawl4AI with JavaScript execution
        start = time.time()
        result = await crawler.arun(
@@ -438,13 +496,18 @@ async def speed_comparison():
                "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
            ],
            word_count_threshold=0,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+                # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
+            ),
            verbose=False,
        )
        end = time.time()
        print("Crawl4AI (with JavaScript execution):")
        print(f"Time taken: {end - start:.2f} seconds")
        print(f"Content length: {len(result.markdown)} characters")
+        print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
        print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")

    print("\nNote on Speed Comparison:")
@@ -483,7 +546,7 @@ async def generate_knowledge_graph():
        url = "https://paulgraham.com/love.html"
        result = await crawler.arun(
            url=url,
-            bypass_cache=True,
+            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
            # magic=True
        )
@@ -492,45 +555,80 @@ async def generate_knowledge_graph():
            f.write(result.extracted_content)

 async def fit_markdown_remove_overlay():
-    async with AsyncWebCrawler(headless = False) as crawler:
-        url = "https://janineintheworld.com/places-to-visit-in-central-mexico"
+    
+    async with AsyncWebCrawler(
+            headless=True,  # Set to False to see what is happening
+            verbose=True,
+            user_agent_mode="random",
+            user_agent_generator_config={
+                "device_type": "mobile",
+                "os_type": "android"
+            },
+    ) as crawler:
        result = await crawler.arun(
-            url=url,
-            bypass_cache=True,
-            word_count_threshold = 10,
-            remove_overlay_elements=True,
-            screenshot = True
+            url='https://www.kidocode.com/degrees/technology',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                ),
+                options={
+                    "ignore_links": True
+                }
+            ),
+            # markdown_generator=DefaultMarkdownGenerator(
+            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
+            #     options={
+            #         "ignore_links": True
+            #     }
+            # ),
        )
-        # Save markdown to file
-        with open(os.path.join(__location__, "mexico_places.md"), "w") as f:
-            f.write(result.fit_markdown)
-
+        
+        if result.success:
+            print(len(result.markdown_v2.raw_markdown))
+            print(len(result.markdown_v2.markdown_with_citations))
+            print(len(result.markdown_v2.fit_markdown))
+            
+            # Save clean html
+            with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
+                f.write(result.cleaned_html)
+            
+            with open(os.path.join(__location__, "output/output_raw_markdown.md"), "w") as f:
+                f.write(result.markdown_v2.raw_markdown)
+                
+            with open(os.path.join(__location__, "output/output_markdown_with_citations.md"), "w") as f:
+                f.write(result.markdown_v2.markdown_with_citations) 
+                
+            with open(os.path.join(__location__, "output/output_fit_markdown.md"), "w") as f:   
+                f.write(result.markdown_v2.fit_markdown)
+        
    print("Done")


 async def main():
-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()

    # LLM extraction examples
-    await extract_structured_data_using_llm()
-    await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-    await extract_structured_data_using_llm("ollama/llama3.2")    
+    # await extract_structured_data_using_llm()
+    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
+    # await extract_structured_data_using_llm("ollama/llama3.2")    

    # You always can pass custom headers to the extraction strategy
-    custom_headers = {
-        "Authorization": "Bearer your-custom-token",
-        "X-Custom-Header": "Some-Value"
-    }
-    await extract_structured_data_using_llm(extra_headers=custom_headers)
+    # custom_headers = {
+    #     "Authorization": "Bearer your-custom-token",
+    #     "X-Custom-Header": "Some-Value"
+    # }
+    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    
-    # await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
+    await crawl_dynamic_content_pages_method_1()
+    await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
    
    await crawl_custom_browser_type()
--- a/docs/examples/storage_state_tutorial.md
+++ b/docs/examples/storage_state_tutorial.md
@@ -0,0 +1,225 @@
+### Using `storage_state` to Pre-Load Cookies and LocalStorage
+
+Crawl4AI’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time.
+
+#### What is `storage_state`?
+
+`storage_state` can be:
+
+- A dictionary containing cookies and localStorage data.
+- A path to a JSON file that holds this information.
+
+When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state.
+
+#### Example Structure
+
+Here’s an example storage state:
+
+```json
+{
+  "cookies": [
+    {
+      "name": "session",
+      "value": "abcd1234",
+      "domain": "example.com",
+      "path": "/",
+      "expires": 1675363572.037711,
+      "httpOnly": false,
+      "secure": false,
+      "sameSite": "None"
+    }
+  ],
+  "origins": [
+    {
+      "origin": "https://example.com",
+      "localStorage": [
+        { "name": "token", "value": "my_auth_token" },
+        { "name": "refreshToken", "value": "my_refresh_token" }
+      ]
+    }
+  ]
+}
+```
+
+This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`.
+
+---
+
+### Passing `storage_state` as a Dictionary
+
+You can directly provide the data as a dictionary:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    storage_dict = {
+        "cookies": [
+            {
+                "name": "session",
+                "value": "abcd1234",
+                "domain": "example.com",
+                "path": "/",
+                "expires": 1675363572.037711,
+                "httpOnly": False,
+                "secure": False,
+                "sameSite": "None"
+            }
+        ],
+        "origins": [
+            {
+                "origin": "https://example.com",
+                "localStorage": [
+                    {"name": "token", "value": "my_auth_token"},
+                    {"name": "refreshToken", "value": "my_refresh_token"}
+                ]
+            }
+        ]
+    }
+
+    async with AsyncWebCrawler(
+        headless=True,
+        storage_state=storage_dict
+    ) as crawler:
+        result = await crawler.arun(url='https://example.com/protected')
+        if result.success:
+            print("Crawl succeeded with pre-loaded session data!")
+            print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+### Passing `storage_state` as a File
+
+If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler(
+        headless=True,
+        storage_state="mystate.json"  # Uses a JSON file instead of a dictionary
+    ) as crawler:
+        result = await crawler.arun(url='https://example.com/protected')
+        if result.success:
+            print("Crawl succeeded with pre-loaded session data!")
+            print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+---
+
+### Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later)
+
+A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can:
+
+1. Perform the login once in a hook.
+2. After login completes, export the resulting `storage_state` to a file.
+3. On subsequent runs, provide that `storage_state` to skip the login step.
+
+**Step-by-Step Example:**
+
+**First Run (Perform Login and Save State):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def on_browser_created_hook(browser):
+    # Access the default context and create a page
+    context = browser.contexts[0]
+    page = await context.new_page()
+    
+    # Navigate to the login page
+    await page.goto("https://example.com/login", wait_until="domcontentloaded")
+    
+    # Fill in credentials and submit
+    await page.fill("input[name='username']", "myuser")
+    await page.fill("input[name='password']", "mypassword")
+    await page.click("button[type='submit']")
+    await page.wait_for_load_state("networkidle")
+    
+    # Now the site sets tokens in localStorage and cookies
+    # Export this state to a file so we can reuse it
+    await context.storage_state(path="my_storage_state.json")
+    await page.close()
+
+async def main():
+    # First run: perform login and export the storage_state
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True,
+        hooks={"on_browser_created": on_browser_created_hook},
+        use_persistent_context=True,
+        user_data_dir="./my_user_data"
+    ) as crawler:
+        
+        # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json
+        result = await crawler.arun(
+            url='https://example.com/protected-page',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+        )
+        print("First run result success:", result.success)
+        if result.success:
+            print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**Second Run (Reuse Saved State, No Login Needed):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+    # Second run: no need to hook on_browser_created this time.
+    # Just provide the previously saved storage state.
+    async with AsyncWebCrawler(
+        headless=True,
+        verbose=True,
+        use_persistent_context=True,
+        user_data_dir="./my_user_data",
+        storage_state="my_storage_state.json"  # Reuse previously exported state
+    ) as crawler:
+        
+        # Now the crawler starts already logged in
+        result = await crawler.arun(
+            url='https://example.com/protected-page',
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+        )
+        print("Second run result success:", result.success)
+        if result.success:
+            print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+**What’s Happening Here?**
+
+- During the first run, the `on_browser_created_hook` logs into the site.  
+- After logging in, the hook exports the current session (cookies, localStorage, etc.) to `my_storage_state.json` by calling `context.storage_state(path=...)`.  
+- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps.
+
+**Sign Out Scenario:**  
+If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time.
+
+---
+
+### Conclusion
+
+By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines.
--- a/docs/examples/tutorial_dynamic_clicks.md
+++ b/docs/examples/tutorial_dynamic_clicks.md
@@ -0,0 +1,117 @@
+# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
+
+## Introduction
+
+When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
+
+1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
+2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
+
+## Prerequisites
+
+- A working installation of Crawl4AI
+- Basic familiarity with Python’s `async`/`await` syntax
+
+## Step-by-Step Approach
+
+Use a session ID to maintain state across multiple `arun()` calls:
+
+```python
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+js_code = [
+    # This JS finds the “Next” button and clicks it
+    "const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
+]
+
+wait_for_condition = "css:.new-content-class"
+
+async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
+    # 1. Load the initial page
+    result_initial = await crawler.arun(
+        url="https://example.com",
+        cache_mode=CacheMode.BYPASS,
+        session_id="my_session"
+    )
+
+    # 2. Click the 'Next' button and wait for new content
+    result_next = await crawler.arun(
+        url="https://example.com",
+        session_id="my_session",
+        js_code=js_code,
+        wait_for=wait_for_condition,
+        js_only=True,
+        cache_mode=CacheMode.BYPASS
+    )
+
+# `result_next` now contains the updated HTML after clicking 'Next'
+```
+
+**Key Points:**
+- **`session_id`**: Keeps the same browser context open.
+- **`js_code`**: Executes JavaScript in the context of the already loaded page.
+- **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
+- **`js_only=True`**: Runs the JS in the current session without reloading the page.
+
+By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
+
+## Single-call Approach
+
+If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
+- Iterates over all the modules or "Next" buttons
+- Clicks them one by one
+- Waits for content updates between each click
+- Once done, returns control to Crawl4AI for extraction.
+
+Example snippet:
+
+```python
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+js_code = [
+    # Example JS that clicks multiple modules:
+    """
+    (async () => {
+      const modules = document.querySelectorAll('.module-item');
+      for (let i = 0; i < modules.length; i++) {
+        modules[i].scrollIntoView();
+        modules[i].click();
+        // Wait for each module’s content to load, adjust 100ms as needed
+        await new Promise(r => setTimeout(r, 100));
+      }
+    })();
+    """
+]
+
+async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
+    result = await crawler.arun(
+        url="https://example.com",
+        js_code=js_code,
+        wait_for="css:.final-loaded-content-class",
+        cache_mode=CacheMode.BYPASS
+    )
+
+# `result` now contains all content after all modules have been clicked in one go.
+```
+
+**Key Points:**
+- All interactions (clicks and waits) happen before the extraction.
+- Ideal for pages where all steps can be done in a single pass.
+
+## Choosing the Right Approach
+
+- **Step-by-Step (Session-based)**: 
+  - Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
+  - Useful if the page requires multiple conditions to be checked at runtime.
+
+- **Single-call**:
+  - Perfect if the sequence of interactions is known in advance.
+  - Cleaner code if the page’s structure is consistent and predictable.
+
+## Conclusion
+
+Crawl4AI makes it easy to handle dynamic content:
+- Use session IDs and multiple `arun()` calls for stepwise crawling.
+- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
+
+This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
--- a/docs/md_v2/advanced/hooks-auth.md
+++ b/docs/md_v2/advanced/hooks-auth.md
@@ -18,7 +18,7 @@ Let's see how we can customize the AsyncWebCrawler using hooks! In this example,
 import asyncio
 from crawl4ai import AsyncWebCrawler
 from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
-from playwright.async_api import Page, Browser
+from playwright.async_api import Page, Browser, BrowserContext

 async def on_browser_created(browser: Browser):
    print("[HOOK] on_browser_created")
@@ -71,7 +71,11 @@ from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
 async def main():
    print("\n🔗 Using Crawler Hooks: Let's see how we can customize the AsyncWebCrawler using hooks!")
    
-    crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
+    initial_cookies = [
+        {"name": "sessionId", "value": "abc123", "domain": ".example.com"},
+        {"name": "userId", "value": "12345", "domain": ".example.com"}
+    ]
+    crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True, cookies=initial_cookies)
    crawler_strategy.set_hook('on_browser_created', on_browser_created)
    crawler_strategy.set_hook('before_goto', before_goto)
    crawler_strategy.set_hook('after_goto', after_goto)
--- a/docs/md_v2/advanced/managed_browser.md
+++ b/docs/md_v2/advanced/managed_browser.md
@@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra

 ## Relevance Content Filter

-The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
+The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
+
+
+## Pruning Content Filter
+
+The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
+
+### Usage
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+async def filter_content(url):
+    async with AsyncWebCrawler() as crawler:
+        content_filter = PruningContentFilter(
+            min_word_threshold=5,
+            threshold_type='dynamic',
+            threshold=0.45
+        )
+        result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
+        if result.success:
+            print(f"Cleaned Markdown:\n{result.fit_markdown}")
+```
+
+### Parameters
+
+- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
+
+- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
+  - `'fixed'`: Uses a constant threshold value for all nodes
+  - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
+
+- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
+  - For fixed threshold: Nodes scoring below this value are removed
+  - For dynamic threshold: This value is adjusted based on node properties
+
+### How It Works
+
+The pruning algorithm evaluates each node using multiple metrics:
+- Text density: Ratio of actual text to overall node content
+- Link density: Proportion of text within links
+- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
+- Content quality: Metrics like text length and structural importance
+
+Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
+
+The algorithm is particularly effective for:
+- Removing boilerplate content
+- Eliminating navigation menus and sidebars
+- Preserving main article content
+- Maintaining document structure while removing noise
+

 ## BM25 Algorithm

--- a/docs/md_v2/basic/cache-modes.md
+++ b/docs/md_v2/basic/cache-modes.md
@@ -1,7 +1,7 @@
 # Crawl4AI Cache System and Migration Guide

 ## Overview
-Starting from version X.X.X, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
+Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.

 ## Old vs New Approach

--- a/docs/md_v2/basic/content_filtering.md
+++ b/docs/md_v2/basic/content_filtering.md
@@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra

 ## Relevance Content Filter

-The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
+The `RelevantContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
+
+
+## Pruning Content Filter
+
+The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
+
+### Usage
+
+```python
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.content_filter_strategy import PruningContentFilter
+
+async def filter_content(url):
+    async with AsyncWebCrawler() as crawler:
+        content_filter = PruningContentFilter(
+            min_word_threshold=5,
+            threshold_type='dynamic',
+            threshold=0.45
+        )
+        result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
+        if result.success:
+            print(f"Cleaned Markdown:\n{result.fit_markdown}")
+```
+
+### Parameters
+
+- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
+
+- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
+  - `'fixed'`: Uses a constant threshold value for all nodes
+  - `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
+
+- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
+  - For fixed threshold: Nodes scoring below this value are removed
+  - For dynamic threshold: This value is adjusted based on node properties
+
+### How It Works
+
+The pruning algorithm evaluates each node using multiple metrics:
+- Text density: Ratio of actual text to overall node content
+- Link density: Proportion of text within links
+- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
+- Content quality: Metrics like text length and structural importance
+
+Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
+
+The algorithm is particularly effective for:
+- Removing boilerplate content
+- Eliminating navigation menus and sidebars
+- Preserving main article content
+- Maintaining document structure while removing noise
+

 ## BM25 Algorithm

@@ -21,7 +73,7 @@ from crawl4ai.content_filter_strategy import BM25ContentFilter
 async def filter_content(url, query=None):
    async with AsyncWebCrawler() as crawler:
        content_filter = BM25ContentFilter(user_query=query)
-        result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
+        result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
        if result.success:
            print(f"Filtered Content (JSON):\n{result.extracted_content}")
            print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
@@ -71,7 +123,7 @@ class MyCustomFilter(RelevantContentFilter):
 async def custom_filter_demo(url: str):
    async with AsyncWebCrawler() as crawler:
        custom_filter = MyCustomFilter()
-        result = await crawler.arun(url, content_filter=custom_filter)
+        result = await crawler.arun(url, extraction_strategy=custom_filter)
        if result.success:
            print(result.extracted_content)

--- a/docs/md_v2/basic/quickstart.md
+++ b/docs/md_v2/basic/quickstart.md
@@ -8,7 +8,7 @@ First, let's import the necessary modules and create an instance of `AsyncWebCra

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, CasheMode
+from crawl4ai import AsyncWebCrawler, CacheMode

 async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
--- a/docs/md_v2/blog/index.md
+++ b/docs/md_v2/blog/index.md
@@ -0,0 +1,47 @@
+# Crawl4AI Blog
+
+Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place.
+
+## Latest Release
+
+### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md)
+*December 12, 2024*
+
+The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. Check out all the new features to make your crawling experience even smoother.
+
+[Read full release notes →](releases/0.4.2.md)
+
+---
+
+### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md)
+*December 8, 2024*
+
+This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered.
+
+[Read full release notes →](releases/0.4.1.md)
+
+---
+
+### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md)
+*December 1, 2024*
+
+Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage.
+
+[Read full release notes →](releases/0.4.0.md)
+
+## Project History
+
+Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates.
+
+## Categories
+
+- [Technical Deep Dives](/blog/technical) - Coming soon
+- [Tutorials & Guides](/blog/tutorials) - Coming soon
+- [Community Updates](/blog/community) - Coming soon
+
+## Stay Updated
+
+- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
+- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
+- Join our community discussions on GitHub
+
--- a/docs/md_v2/blog/releases/0.4.0.md
+++ b/docs/md_v2/blog/releases/0.4.0.md
@@ -0,0 +1,62 @@
+# Release Summary for Version 0.4.0 (December 1, 2024)
+
+## Overview
+The 0.4.0 release introduces significant improvements to content filtering, multi-threaded environment handling, user-agent generation, and test coverage. Key highlights include the introduction of the PruningContentFilter, designed to automatically identify and extract the most valuable parts of an HTML document, as well as enhancements to the BM25ContentFilter to extend its versatility and effectiveness.
+
+## Major Features and Enhancements
+
+### 1. PruningContentFilter
+- Introduced a new unsupervised content filtering strategy that scores and prunes less relevant nodes in an HTML document based on metrics like text and link density.
+- Focuses on retaining the most valuable parts of the content, making it highly effective for extracting relevant information from complex web pages.
+- Fully documented with updated README and expanded user guides.
+
+### 2. User-Agent Generator
+- Added a user-agent generator utility that resolves compatibility issues and supports customizable user-agent strings.
+- By default, the generator randomizes user agents for each request, adding diversity, but users can customize it for tailored scenarios.
+
+### 3. Enhanced Thread Safety
+- Improved handling of multi-threaded environments by adding better thread locks for parallel processing, ensuring consistency and stability when running multiple threads.
+
+### 4. Extended Content Filtering Strategies
+- Users now have access to both the PruningContentFilter for unsupervised extraction and the BM25ContentFilter for supervised filtering based on user queries.
+- Enhanced BM25ContentFilter with improved capabilities to process page titles, meta tags, and descriptions, allowing for more effective classification and clustering of text chunks.
+
+### 5. Documentation Updates
+- Updated examples and tutorials to promote the use of the PruningContentFilter alongside the BM25ContentFilter, providing clear instructions for selecting the appropriate filter for each use case.
+
+### 6. Unit Test Enhancements
+- Added unit tests for PruningContentFilter to ensure accuracy and reliability.
+- Enhanced BM25ContentFilter tests to cover additional edge cases and performance metrics, particularly for malformed HTML inputs.
+
+## Revised Change Logs for Version 0.4.0
+
+### PruningContentFilter (Dec 01, 2024)
+- Introduced the PruningContentFilter to optimize content extraction by pruning less relevant HTML nodes.
+  - **Affected Files:**
+    - **crawl4ai/content_filter_strategy.py**: Added a scoring-based pruning algorithm.
+    - **README.md**: Updated to include PruningContentFilter usage.
+    - **docs/md_v2/basic/content_filtering.md**: Expanded user documentation, detailing the use and benefits of PruningContentFilter.
+
+### Unit Tests for PruningContentFilter (Dec 01, 2024)
+- Added comprehensive unit tests for PruningContentFilter to ensure correctness and efficiency.
+  - **Affected Files:**
+    - **tests/async/test_content_filter_prune.py**: Created tests covering different pruning scenarios to ensure stability and correctness.
+
+### Enhanced BM25ContentFilter Tests (Dec 01, 2024)
+- Expanded tests to cover additional extraction scenarios and performance metrics, improving robustness.
+  - **Affected Files:**
+    - **tests/async/test_content_filter_bm25.py**: Added tests for edge cases, including malformed HTML inputs.
+
+### Documentation and Example Updates (Dec 01, 2024)
+- Revised examples to illustrate the use of PruningContentFilter alongside existing content filtering methods.
+  - **Affected Files:**
+    - **docs/examples/quickstart_async.py**: Enhanced example clarity and usability for new users.
+
+## Experimental Features
+- The PruningContentFilter is still under experimental development, and we continue to gather feedback for further refinements.
+
+## Conclusion
+This release significantly enhances the content extraction capabilities of Crawl4AI with the introduction of the PruningContentFilter, improved supervised filtering with BM25ContentFilter, and robust multi-threaded handling. Additionally, the user-agent generator provides much-needed versatility, resolving compatibility issues faced by many users.
+
+Users are encouraged to experiment with the new content filtering methods to determine which best suits their needs.
+
--- a/docs/md_v2/blog/releases/0.4.1.md
+++ b/docs/md_v2/blog/releases/0.4.1.md
@@ -0,0 +1,145 @@
+# Release Summary for Version 0.4.1 (December 8, 2024): Major Efficiency Boosts with New Features!
+
+_This post was generated with the help of ChatGPT, so take everything with a grain of salt. 🧂_
+
+Hi everyone,
+
+I just finished putting together version 0.4.1 of Crawl4AI, and there are a few changes in here that I think you’ll find really helpful. I’ll explain what’s new, why it matters, and exactly how you can use these features (with the code to back it up). Let’s get into it.
+
+---
+
+### Handling Lazy Loading Better (Images Included)
+
+One thing that always bugged me with crawlers is how often they miss lazy-loaded content, especially images. In this version, I made sure Crawl4AI **waits for all images to load** before moving forward. This is useful because many modern websites only load images when they’re in the viewport or after some JavaScript executes.
+
+Here’s how to enable it:
+
+```python
+await crawler.crawl(
+    url="https://example.com",
+    wait_for_images=True  # Add this argument to ensure images are fully loaded
+)
+```
+
+What this does is:
+1. Waits for the page to reach a "network idle" state.
+2. Ensures all images on the page have been completely loaded.
+
+This single change handles the majority of lazy-loading cases you’re likely to encounter.
+
+---
+
+### Text-Only Mode (Fast, Lightweight Crawling)
+
+Sometimes, you don’t need to download images or process JavaScript at all. For example, if you’re crawling to extract text data, you can enable **text-only mode** to speed things up. By disabling images, JavaScript, and other heavy resources, this mode makes crawling **3-4 times faster** in most cases.
+
+Here’s how to turn it on:
+
+```python
+crawler = AsyncPlaywrightCrawlerStrategy(
+    text_only=True  # Set this to True to enable text-only crawling
+)
+```
+
+When `text_only=True`, the crawler automatically:
+- Disables GPU processing.
+- Blocks image and JavaScript resources.
+- Reduces the viewport size to 800x600 (you can override this with `viewport_width` and `viewport_height`).
+
+If you need to crawl thousands of pages where you only care about text, this mode will save you a ton of time and resources.
+
+---
+
+### Adjusting the Viewport Dynamically
+
+Another useful addition is the ability to **dynamically adjust the viewport size** to match the content on the page. This is particularly helpful when you’re working with responsive layouts or want to ensure all parts of the page load properly.
+
+Here’s how it works:
+1. The crawler calculates the page’s width and height after it loads.
+2. It adjusts the viewport to fit the content dimensions.
+3. (Optional) It uses Chrome DevTools Protocol (CDP) to simulate zooming out so everything fits in the viewport.
+
+To enable this, use:
+
+```python
+await crawler.crawl(
+    url="https://example.com",
+    adjust_viewport_to_content=True  # Dynamically adjusts the viewport
+)
+```
+
+This approach makes sure the entire page gets loaded into the viewport, especially for layouts that load content based on visibility.
+
+---
+
+### Simulating Full-Page Scrolling
+
+Some websites load data dynamically as you scroll down the page. To handle these cases, I added support for **full-page scanning**. It simulates scrolling to the bottom of the page, checking for new content, and capturing it all.
+
+Here’s an example:
+
+```python
+await crawler.crawl(
+    url="https://example.com",
+    scan_full_page=True,   # Enables scrolling
+    scroll_delay=0.2       # Waits 200ms between scrolls (optional)
+)
+```
+
+What happens here:
+1. The crawler scrolls down in increments, waiting for content to load after each scroll.
+2. It stops when no new content appears (i.e., dynamic elements stop loading).
+3. It scrolls back to the top before finishing (if necessary).
+
+If you’ve ever had to deal with infinite scroll pages, this is going to save you a lot of headaches.
+
+---
+
+### Reusing Browser Sessions (Save Time on Setup)
+
+By default, every time you crawl a page, a new browser context (or tab) is created. That’s fine for small crawls, but if you’re working on a large dataset, it’s more efficient to reuse the same session.
+
+I added a method called `create_session` for this:
+
+```python
+session_id = await crawler.create_session()
+
+# Use the same session for multiple crawls
+await crawler.crawl(
+    url="https://example.com/page1",
+    session_id=session_id  # Reuse the session
+)
+await crawler.crawl(
+    url="https://example.com/page2",
+    session_id=session_id
+)
+```
+
+This avoids creating a new tab for every page, speeding up the crawl and reducing memory usage.
+
+---
+
+### Other Updates
+
+Here are a few smaller updates I’ve made:
+- **Light Mode**: Use `light_mode=True` to disable background processes, extensions, and other unnecessary features, making the browser more efficient.
+- **Logging**: Improved logs to make debugging easier.
+- **Defaults**: Added sensible defaults for things like `delay_before_return_html` (now set to 0.1 seconds).
+
+---
+
+### How to Get the Update
+
+You can install or upgrade to version `0.4.1` like this:
+
+```bash
+pip install crawl4ai --upgrade
+```
+
+As always, I’d love to hear your thoughts. If there’s something you think could be improved or if you have suggestions for future versions, let me know!
+
+Enjoy the new features, and happy crawling! 🕷️
+
+--- 
+
+
--- a/docs/md_v2/blog/releases/0.4.2.md
+++ b/docs/md_v2/blog/releases/0.4.2.md
@@ -0,0 +1,86 @@
+## 🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier (Dec 12, 2024)
+
+### Hey Developers,
+
+I’m excited to share Crawl4AI 0.4.2—a major upgrade that makes crawling smarter, faster, and a whole lot more intuitive. I’ve packed in a bunch of new features to simplify your workflows and improve your experience. Let’s cut to the chase!
+
+---
+
+### 🔧 **Configurable Browser and Crawler Behavior**
+
+You’ve asked for better control over how browsers and crawlers are configured, and now you’ve got it. With the new `BrowserConfig` and `CrawlerRunConfig` objects, you can set up your browser and crawling behavior exactly how you want. No more cluttering `arun` with a dozen arguments—just pass in your configs and go.
+
+**Example:**
+```python
+from crawl4ai import BrowserConfig, CrawlerRunConfig, AsyncWebCrawler
+
+browser_config = BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080)
+crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+    result = await crawler.arun(url="https://example.com", config=crawler_config)
+    print(result.markdown[:500])
+```
+
+This setup is a game-changer for scalability, keeping your code clean and flexible as we add more parameters in the future.
+
+Remember: if you prefer the old way, you can still pass arguments directly to `arun` as before—no worries!
+
+---
+
+### 🔐 **Streamlined Session Management**
+
+Here’s the big one: You can now pass local storage and cookies directly. Whether it’s setting values programmatically or importing a saved JSON state, managing sessions has never been easier. This is a must-have for authenticated crawls—just export your storage state once and reuse it effortlessly across runs.
+
+**Example:**
+1. Open a browser, log in manually, and export the storage state.
+2. Import the JSON file for seamless authenticated crawling:
+
+```python
+result = await crawler.arun(
+    url="https://example.com/protected",
+    storage_state="my_storage_state.json"
+)
+```
+
+---
+
+### 🔢 **Handling Large Pages: Supercharged Screenshots and PDF Conversion**
+
+Two big upgrades here:
+
+- **Blazing-fast long-page screenshots**: Turn extremely long web pages into clean, high-quality screenshots—without breaking a sweat. It’s optimized to handle large content without lag.
+
+- **Full-page PDF exports**: Now, you can also convert any page into a PDF with all the details intact. Perfect for archiving or sharing complex layouts.
+
+---
+
+### 🔧 **Other Cool Stuff**
+
+- **Anti-bot enhancements**: Magic mode now handles overlays, user simulation, and anti-detection features like a pro.
+- **JavaScript execution**: Execute custom JS snippets to handle dynamic content. No more wrestling with endless page interactions.
+
+---
+
+### 📊 **Performance Boosts and Dev-friendly Updates**
+
+- Faster rendering and viewport adjustments for better performance.
+- Improved cookie and local storage handling for seamless authentication.
+- Better debugging with detailed logs and actionable error messages.
+
+---
+
+### 🔠 **Use Cases You’ll Love**
+
+1. **Authenticated Crawls**: Log in once, export your storage state, and reuse it across multiple requests without the headache.
+2. **Long-page Screenshots**: Perfect for blogs, e-commerce pages, or any endless-scroll website.
+3. **PDF Export**: Create professional-looking page PDFs in seconds.
+
+---
+
+### Let’s Get Crawling
+
+Crawl4AI 0.4.2 is ready for you to download and try. I’m always looking for ways to improve, so don’t hold back—share your thoughts and feedback.
+
+Happy Crawling! 🚀
+