Release prep (#749)

* fix: Update export of URLPatternFilter

* chore: Add dependency for cchardet in requirements

* docs: Update example for deep crawl in release note for v0.5

* Docs: update the example for memory dispatcher

* docs: updated example for crawl strategies

* Refactor: Removed wrapping in if __name__ == "__main__" block since this is a markdown file.

* chore: removed cchardet from dependency list, since unclecode is planning to remove it

* docs: updated the example for proxy rotation to a working example

* feat: Introduced ProxyConfig param

* Add tutorial for deep crawl & update contributor list for bug fixes in Feb alpha-1

* chore: update and test new dependencies

* feat: Make PyPDF2 a conditional dependency

* updated tutorial and release note for v0.5

* docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename

* refactor: 1. Deprecate markdown_v2. 2. Make markdown backward compatible so it behaves as a string when needed. 3. Fix LlmConfig usage in the CLI. 4. Deprecate markdown_v2 in the CLI. 5. Update AsyncWebCrawler for changes in CrawlResult (see the sketch after this list).

* fix: Bug in serialisation of markdown in acache_url

* Refactor: Added deprecation errors for fit_html and fit_markdown when accessed directly on the result; access them via markdown instead

* fix: remove deprecated markdown_v2 from docker

* Refactor: remove deprecated fit_markdown and fit_html from result

* refactor: fix cache retrieval for markdown as a string

* chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
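
For readers updating their own scripts, here is a minimal, illustrative sketch of the new access pattern described above (not taken verbatim from this commit's diff): it assumes only the public AsyncWebCrawler API used in the examples below, and that fit_markdown is populated only when a content filter is configured.

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://docs.crawl4ai.com")

        # markdown_v2 is deprecated: the generated markdown now lives on result.markdown
        print(result.markdown.raw_markdown[:200])

        # Backward compatibility: per the commit notes, result.markdown can still behave
        # like a plain string where older code expects one
        print(str(result.markdown)[:200])

        # fit_markdown / fit_html are no longer read directly from the result;
        # they are reached through the markdown object (empty unless a content filter is set)
        if result.markdown.fit_markdown:
            print(result.markdown.fit_markdown[:200])

if __name__ == "__main__":
    asyncio.run(main())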
Aravind
2025-02-28 17:23:35 +05:30
committed by GitHub
parent 3a87b4e43b
commit a9e24307cc
38 changed files with 2040 additions and 326 deletions

View File

@@ -52,7 +52,7 @@ async def crawl_sequential(urls: List[str]):
)
if result.success:
print(f"Successfully crawled {url}")
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
print(f"Content length: {len(result.markdown.raw_markdown)}")
finally:
await crawler.close()
@@ -101,7 +101,7 @@ async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
print(f"Error crawling {url}: {str(result)}")
elif result.success:
print(f"Successfully crawled {url}")
print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
print(f"Content length: {len(result.markdown.raw_markdown)}")
finally:
await crawler.close()

docs/examples/deepcrawl.py (new file, 404 lines added)
View File

@@ -0,0 +1,404 @@
import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import (
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
ContentRelevanceFilter,
SEOFilter,
)
from crawl4ai.deep_crawling.scorers import (
KeywordRelevanceScorer,
)
# 1⃣ Basic Deep Crawl Setup
async def basic_deep_crawl():
"""
PART 1: Basic Deep Crawl setup - Demonstrates a simple two-level deep crawl.
This function shows:
- How to set up BFSDeepCrawlStrategy (Breadth-First Search)
- Setting depth and domain parameters
- Processing the results to show the hierarchy
"""
print("\n===== BASIC DEEP CRAWL SETUP =====")
# Configure a 2-level deep crawl using Breadth-First Search strategy
# max_depth=2 means: initial page (depth 0) + 2 more levels
# include_external=False means: only follow links within the same domain
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True, # Show progress during crawling
)
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
# Group results by depth to visualize the crawl tree
pages_by_depth = {}
for result in results:
depth = result.metadata.get("depth", 0)
if depth not in pages_by_depth:
pages_by_depth[depth] = []
pages_by_depth[depth].append(result.url)
print(f"✅ Crawled {len(results)} pages total")
# Display crawl structure by depth
for depth, urls in sorted(pages_by_depth.items()):
print(f"\nDepth {depth}: {len(urls)} pages")
# Show first 3 URLs for each depth as examples
for url in urls[:3]:
print(f"{url}")
if len(urls) > 3:
print(f" ... and {len(urls) - 3} more")
print(
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
)
# 2⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
"""
PART 2: Demonstrates the difference between stream and non-stream execution.
Non-stream: Waits for all results before processing
Stream: Processes results as they become available
"""
print("\n===== STREAM VS. NON-STREAM EXECUTION =====")
# Common configuration for both examples
base_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
)
async with AsyncWebCrawler() as crawler:
# NON-STREAMING MODE
print("\n📊 NON-STREAMING MODE:")
print(" In this mode, all results are collected before being returned.")
non_stream_config = base_config.clone()
non_stream_config.stream = False
start_time = time.perf_counter()
results = await crawler.arun(
url="https://docs.crawl4ai.com", config=non_stream_config
)
print(f" ✅ Received all {len(results)} results at once")
print(f" ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")
# STREAMING MODE
print("\n📊 STREAMING MODE:")
print(" In this mode, results are processed as they become available.")
stream_config = base_config.clone()
stream_config.stream = True
start_time = time.perf_counter()
result_count = 0
first_result_time = None
async for result in await crawler.arun(
url="https://docs.crawl4ai.com", config=stream_config
):
result_count += 1
if result_count == 1:
first_result_time = time.perf_counter() - start_time
print(
f" ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
)
elif result_count % 5 == 0: # Show every 5th result for brevity
print(f" → Result #{result_count}: {result.url}")
print(f" ✅ Total: {result_count} results")
print(f" ✅ First result: {first_result_time:.2f} seconds")
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3⃣ Introduce Filters & Scorers
async def filters_and_scorers():
"""
PART 3: Demonstrates the use of filters and scorers for more targeted crawling.
This function progressively adds:
1. A single URL pattern filter
2. Multiple filters in a chain
3. Scorers for prioritizing pages
"""
print("\n===== FILTERS AND SCORERS =====")
async with AsyncWebCrawler() as crawler:
# SINGLE FILTER EXAMPLE
print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
print(" Only crawl pages containing 'core' in the URL")
# Create a filter that only allows URLs with 'core' in them
url_filter = URLPatternFilter(patterns=["*core*"])
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1,
include_external=False,
filter_chain=FilterChain([url_filter]), # Single filter
),
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=CacheMode.BYPASS,
verbose=True,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Crawled {len(results)} pages matching '*core*'")
for result in results[:3]: # Show first 3 results
print(f"{result.url}")
if len(results) > 3:
print(f" ... and {len(results) - 3} more")
# MULTIPLE FILTERS EXAMPLE
print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
print(" Only crawl pages that:")
print(" 1. Contain '2024' in the URL")
print(" 2. Are from 'techcrunch.com'")
print(" 3. Are of text/html or application/javascript content type")
# Create a chain of filters
filter_chain = FilterChain(
[
URLPatternFilter(patterns=["*2024*"]),
DomainFilter(
allowed_domains=["techcrunch.com"],
blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
),
ContentTypeFilter(
allowed_types=["text/html", "application/javascript"]
),
]
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, include_external=False, filter_chain=filter_chain
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
)
results = await crawler.arun(url="https://techcrunch.com", config=config)
print(f" ✅ Crawled {len(results)} pages after applying all filters")
for result in results[:3]:
print(f"{result.url}")
if len(results) > 3:
print(f" ... and {len(results) - 3} more")
# SCORERS EXAMPLE
print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
print(
"Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
)
# Create a keyword relevance scorer
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
)
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
max_depth=1, include_external=False, url_scorer=keyword_scorer
),
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=CacheMode.BYPASS,
verbose=True,
stream=True,
)
results = []
async for result in await crawler.arun(
url="https://docs.crawl4ai.com", config=config
):
results.append(result)
score = result.metadata.get("score")
print(f" → Score: {score:.2f} | {result.url}")
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4⃣ Wrap-Up and Key Takeaways
async def wrap_up():
"""
PART 4: Wrap-Up and Key Takeaways
Summarize the key concepts learned in this tutorial.
"""
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
print("Combining filters, scorers, and streaming for an optimized crawl")
# Create a sophisticated filter chain
filter_chain = FilterChain(
[
DomainFilter(
allowed_domains=["docs.crawl4ai.com"],
blocked_domains=["old.docs.crawl4ai.com"],
),
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
ContentTypeFilter(allowed_types=["text/html"]),
]
)
# Create a composite scorer that combines multiple scoring strategies
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"], weight=0.7
)
# Set up the configuration
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1,
include_external=False,
filter_chain=filter_chain,
url_scorer=keyword_scorer,
),
scraping_strategy=LXMLWebScrapingStrategy(),
stream=True,
verbose=True,
)
# Execute the crawl
results = []
start_time = time.perf_counter()
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun(
url="https://docs.crawl4ai.com", config=config
):
results.append(result)
score = result.metadata.get("score", 0)
depth = result.metadata.get("depth", 0)
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
duration = time.perf_counter() - start_time
# Summarize the results
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
print(
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
)
# Group by depth
depth_counts = {}
for result in results:
depth = result.metadata.get("depth", 0)
depth_counts[depth] = depth_counts.get(depth, 0) + 1
print("\n📊 Pages crawled by depth:")
for depth, count in sorted(depth_counts.items()):
print(f" Depth {depth}: {count} pages")
# 5⃣ Advanced Filters
async def advanced_filters():
"""
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
This function covers:
- SEO filters
- Text relevancy filtering
- Combining advanced filters
"""
print("\n===== ADVANCED FILTERS =====")
async with AsyncWebCrawler() as crawler:
# SEO FILTER EXAMPLE
print("\n📊 EXAMPLE 1: SEO FILTERS")
print(
"Quantitative SEO quality assessment filter based searching keywords in the head section"
)
seo_filter = SEOFilter(
threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([seo_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages with relevant keywords")
for result in results:
print(f"{result.url}")
# ADVANCED TEXT RELEVANCY FILTER
print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
# More sophisticated content relevance filter
relevance_filter = ContentRelevanceFilter(
query="Interact with the web using your authentic digital identity",
threshold=0.7,
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([relevance_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages")
for result in results:
relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}")
# Main function to run the entire tutorial
async def run_tutorial():
"""
Executes all tutorial sections in sequence.
"""
print("\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀")
print("======================================")
print("This tutorial will walk you through deep crawling techniques,")
print("from basic to advanced, using the Crawl4AI library.")
# Define sections - uncomment to run specific parts during development
tutorial_sections = [
basic_deep_crawl,
stream_vs_nonstream,
filters_and_scorers,
wrap_up,
advanced_filters,
]
for section in tutorial_sections:
await section()
print("\n🎉 TUTORIAL COMPLETE! 🎉")
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly
if __name__ == "__main__":
asyncio.run(run_tutorial())

View File

@@ -39,9 +39,9 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
if result.success:
print(f"\n=== {name} Results ===")
print(f"Extracted Content: {result.extracted_content}")
print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
print(f"Raw Markdown Length: {len(result.markdown.raw_markdown)}")
print(
f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
f"Citations Markdown Length: {len(result.markdown.markdown_with_citations)}"
)
else:
print(f"Error in {name}: Crawl failed")

View File

@@ -25,7 +25,7 @@ async def main():
# url="https://www.helloworld.org", config=crawler_config
url="https://www.kidocode.com", config=crawler_config
)
print(result.markdown_v2.raw_markdown[:500])
print(result.markdown.raw_markdown[:500])
# print(result.model_dump())

View File

@@ -80,7 +80,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "003376f3",
"metadata": {},
"outputs": [
@@ -114,7 +114,7 @@
" url=\"https://www.nbcnews.com/business\",\n",
" bypass_cache=True # By default this is False, meaning the cache will be used\n",
" )\n",
" print(result.markdown[:500]) # Print the first 500 characters\n",
" print(result.markdown.raw_markdown[:500]) # Print the first 500 characters\n",
" \n",
"asyncio.run(simple_crawl())"
]
@@ -129,7 +129,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": null,
"id": "5bb8c1e4",
"metadata": {},
"outputs": [
@@ -177,7 +177,7 @@
" # wait_for=wait_for,\n",
" bypass_cache=True,\n",
" )\n",
" print(result.markdown[:500]) # Print first 500 characters\n",
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
"\n",
"asyncio.run(crawl_dynamic_content())"
]
@@ -206,11 +206,11 @@
" word_count_threshold=10,\n",
" bypass_cache=True\n",
" )\n",
" full_markdown_length = len(result.markdown)\n",
" fit_markdown_length = len(result.fit_markdown)\n",
" full_markdown_length = len(result.markdown.raw_markdown)\n",
" fit_markdown_length = len(result.markdown.fit_markdown)\n",
" print(f\"Full Markdown Length: {full_markdown_length}\")\n",
" print(f\"Fit Markdown Length: {fit_markdown_length}\")\n",
" print(result.fit_markdown[:1000])\n",
" print(result.markdown.fit_markdown[:1000])\n",
" \n",
"\n",
"asyncio.run(clean_content())"
@@ -342,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"id": "bc4d2fc8",
"metadata": {},
"outputs": [
@@ -387,7 +387,7 @@
" url=\"https://crawl4ai.com\",\n",
" bypass_cache=True\n",
" )\n",
" print(result.markdown[:500]) # Display the first 500 characters\n",
" print(result.markdown.raw_markdown[:500]) # Display the first 500 characters\n",
"\n",
"asyncio.run(custom_hook_workflow())"
]
@@ -465,7 +465,7 @@
" bypass_cache=True\n",
" )\n",
" print(f\"Page {page_number} Content:\")\n",
" print(result.markdown[:500]) # Print first 500 characters\n",
" print(result.markdown.raw_markdown[:500]) # Print first 500 characters\n",
"\n",
"# asyncio.run(multi_page_session_crawl())"
]

View File

@@ -59,8 +59,8 @@ async def clean_content():
url="https://en.wikipedia.org/wiki/Apple",
config=crawler_config,
)
full_markdown_length = len(result.markdown_v2.raw_markdown)
fit_markdown_length = len(result.markdown_v2.fit_markdown)
full_markdown_length = len(result.markdown.raw_markdown)
fit_markdown_length = len(result.markdown.fit_markdown)
print(f"Full Markdown Length: {full_markdown_length}")
print(f"Fit Markdown Length: {fit_markdown_length}")
@@ -139,7 +139,7 @@ async def custom_hook_workflow(verbose=True):
# Perform the crawl operation
result = await crawler.arun(url="https://crawl4ai.com")
print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
# Proxy Example
@@ -584,9 +584,9 @@ async def speed_comparison():
end = time.time()
print("Crawl4AI (Markdown Plus):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
print()

View File

@@ -514,9 +514,9 @@ async def speed_comparison():
end = time.time()
print("Crawl4AI (Markdown Plus):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown_v2.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
print()
# Crawl4AI with JavaScript execution
@@ -539,9 +539,9 @@ async def speed_comparison():
end = time.time()
print("Crawl4AI (with JavaScript execution):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown)} characters")
print(f"Fit Markdown: {len(result.markdown_v2.fit_markdown)} characters")
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
print("\nNote on Speed Comparison:")
print("The speed test conducted here may not reflect optimal conditions.")
@@ -613,9 +613,9 @@ async def fit_markdown_remove_overlay():
)
if result.success:
print(len(result.markdown_v2.raw_markdown))
print(len(result.markdown_v2.markdown_with_citations))
print(len(result.markdown_v2.fit_markdown))
print(len(result.markdown.raw_markdown))
print(len(result.markdown.markdown_with_citations))
print(len(result.markdown.fit_markdown))
# Save clean html
with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
@@ -624,18 +624,18 @@ async def fit_markdown_remove_overlay():
with open(
os.path.join(__location__, "output/output_raw_markdown.md"), "w"
) as f:
f.write(result.markdown_v2.raw_markdown)
f.write(result.markdown.raw_markdown)
with open(
os.path.join(__location__, "output/output_markdown_with_citations.md"),
"w",
) as f:
f.write(result.markdown_v2.markdown_with_citations)
f.write(result.markdown.markdown_with_citations)
with open(
os.path.join(__location__, "output/output_fit_markdown.md"), "w"
) as f:
f.write(result.markdown_v2.fit_markdown)
f.write(result.markdown.fit_markdown)
print("Done")

View File

@@ -26,7 +26,7 @@ async def little_hello_web():
result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org"
)
print(result.markdown_v2.raw_markdown[:500])
print(result.markdown.raw_markdown[:500])
async def hello_web():
browser_config = BrowserConfig(headless=True, verbose=True)
@@ -42,7 +42,7 @@ async def hello_web():
result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org", config=crawler_config
)
print(result.markdown_v2.fit_markdown[:500])
print(result.markdown.fit_markdown[:500])
# Naive Approach Using Large Language Models
async def extract_using_llm():

View File

@@ -0,0 +1,460 @@
import asyncio
import time
import re
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, BrowserConfig, MemoryAdaptiveDispatcher, HTTPCrawlerConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
BestFirstCrawlingStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.configs import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint
# 1⃣ Deep Crawling with Best-First Strategy
async def deep_crawl():
"""
PART 1: Deep Crawling with Best-First Strategy
This function demonstrates:
- Using the BestFirstCrawlingStrategy
- Creating filter chains to narrow down crawl targets
- Using a scorer to prioritize certain URLs
- Respecting robots.txt rules
"""
print("\n===== DEEP CRAWLING =====")
print("This example shows how to implement deep crawling with filters, scorers, and robots.txt compliance.")
# Create a filter chain to filter urls based on patterns, domains and content type
filter_chain = FilterChain(
[
DomainFilter(
allowed_domains=["docs.crawl4ai.com"],
blocked_domains=["old.docs.crawl4ai.com"],
),
URLPatternFilter(patterns=["*core*", "*advanced*"],),
ContentTypeFilter(allowed_types=["text/html"]),
]
)
# Create a keyword scorer that prioritises the pages with certain keywords first
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"], weight=0.7
)
# Set up the configuration with robots.txt compliance enabled
deep_crawl_config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
filter_chain=filter_chain,
url_scorer=keyword_scorer,
),
scraping_strategy=LXMLWebScrapingStrategy(),
stream=True,
verbose=True,
check_robots_txt=True, # Enable robots.txt compliance
)
# Execute the crawl
async with AsyncWebCrawler() as crawler:
print("\n📊 Starting deep crawl with Best-First strategy...")
print(" - Filtering by domain, URL patterns, and content type")
print(" - Scoring pages based on keyword relevance")
print(" - Respecting robots.txt rules")
start_time = time.perf_counter()
results = []
async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=deep_crawl_config):
# Print each result as it comes in
depth = result.metadata.get("depth", 0)
score = result.metadata.get("score", 0)
print(f"Crawled: {result.url} (Depth: {depth}), score: {score:.2f}")
results.append(result)
duration = time.perf_counter() - start_time
# Print summary statistics
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
# Group by depth
if results:
depth_counts = {}
for result in results:
depth = result.metadata.get("depth", 0)
depth_counts[depth] = depth_counts.get(depth, 0) + 1
print("\n📊 Pages crawled by depth:")
for depth, count in sorted(depth_counts.items()):
print(f" Depth {depth}: {count} pages")
# 2⃣ Memory-Adaptive Dispatcher
async def memory_adaptive_dispatcher():
"""
PART 2: Memory-Adaptive Dispatcher
This function demonstrates:
- Using MemoryAdaptiveDispatcher to manage system memory
- Batch and streaming modes with multiple URLs
"""
print("\n===== MEMORY-ADAPTIVE DISPATCHER =====")
print("This example shows how to use the memory-adaptive dispatcher for resource management.")
# Configure the dispatcher (optional, defaults are used if not provided)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=80.0, # Pause if memory usage exceeds 80%
check_interval=0.5, # Check memory every 0.5 seconds
)
# Test URLs
urls = [
"https://docs.crawl4ai.com",
"https://github.com/unclecode/crawl4ai"
]
async def batch_mode():
print("\n📊 BATCH MODE:")
print(" In this mode, all results are collected before being returned.")
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
results = await crawler.arun_many(
urls=urls,
config=CrawlerRunConfig(stream=False), # Batch mode
dispatcher=dispatcher,
)
print(f" ✅ Received all {len(results)} results after {time.perf_counter() - start_time:.2f} seconds")
for result in results:
print(f"{result.url} with status code: {result.status_code}")
async def stream_mode():
print("\n📊 STREAMING MODE:")
print(" In this mode, results are processed as they become available.")
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
count = 0
first_result_time = None
async for result in await crawler.arun_many(
urls=urls,
config=CrawlerRunConfig(stream=True), # Stream mode
dispatcher=dispatcher,
):
count += 1
current_time = time.perf_counter() - start_time
if count == 1:
first_result_time = current_time
print(f" ✅ First result after {first_result_time:.2f} seconds: {result.url}")
else:
print(f" → Result #{count} after {current_time:.2f} seconds: {result.url}")
print(f" ✅ Total: {count} results")
print(f" ✅ First result: {first_result_time:.2f} seconds")
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
# Run both examples
await batch_mode()
await stream_mode()
print("\n🔍 Key Takeaway: The memory-adaptive dispatcher prevents OOM errors")
print(" and manages concurrency based on system resources.")
# 3⃣ HTTP Crawler Strategy
async def http_crawler_strategy():
"""
PART 3: HTTP Crawler Strategy
This function demonstrates:
- Using the lightweight HTTP-only crawler
- Setting custom headers and configurations
"""
print("\n===== HTTP CRAWLER STRATEGY =====")
print("This example shows how to use the fast, lightweight HTTP-only crawler.")
# Use the HTTP crawler strategy
http_config = HTTPCrawlerConfig(
method="GET",
headers={"User-Agent": "MyCustomBot/1.0"},
follow_redirects=True,
verify_ssl=True
)
print("\n📊 Initializing HTTP crawler strategy...")
print(" - Using custom User-Agent: MyCustomBot/1.0")
print(" - Following redirects: Enabled")
print(" - Verifying SSL: Enabled")
# Create crawler with HTTP strategy
async with AsyncWebCrawler(
crawler_strategy=AsyncHTTPCrawlerStrategy(browser_config=http_config)
) as crawler:
start_time = time.perf_counter()
result = await crawler.arun("https://example.com")
duration = time.perf_counter() - start_time
print(f"\n✅ Crawled in {duration:.2f} seconds")
print(f"✅ Status code: {result.status_code}")
print(f"✅ Content length: {len(result.html)} bytes")
# Check if there was a redirect
if result.redirected_url and result.redirected_url != result.url:
print(f" Redirected from {result.url} to {result.redirected_url}")
print("\n🔍 Key Takeaway: HTTP crawler is faster and more memory-efficient")
print(" than browser-based crawling for simple pages.")
# 4⃣ Proxy Rotation
async def proxy_rotation():
"""
PART 4: Proxy Rotation
This function demonstrates:
- Setting up a proxy rotation strategy
- Using multiple proxies in a round-robin fashion
"""
print("\n===== PROXY ROTATION =====")
print("This example shows how to implement proxy rotation for distributed crawling.")
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
if not proxies:
print("No proxies found in environment. Set PROXIES env variable!")
return
proxy_strategy = RoundRobinProxyStrategy(proxies)
# Create configs
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
proxy_rotation_strategy=proxy_strategy
)
async with AsyncWebCrawler(config=browser_config) as crawler:
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
print("\n📈 Initializing crawler with proxy rotation...")
async with AsyncWebCrawler(config=browser_config) as crawler:
print("\n🚀 Starting batch crawl with proxy rotation...")
results = await crawler.arun_many(
urls=urls,
config=run_config
)
for result in results:
if result.success:
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
current_proxy = run_config.proxy_config if run_config.proxy_config else None
if current_proxy and ip_match:
print(f"URL {result.url}")
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
verified = ip_match.group(0) == current_proxy.ip
if verified:
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
else:
print("❌ Proxy failed or IP mismatch!")
print("---")
else:
print(f"❌ Crawl via proxy failed!: {result.error_message}")
# 5⃣ LLM Content Filter (requires API key)
async def llm_content_filter():
"""
PART 5: LLM Content Filter
This function demonstrates:
- Configuring LLM providers via LlmConfig
- Using LLM to generate focused markdown
- LlmConfig for configuration
Note: Requires a valid API key for the chosen LLM provider
"""
print("\n===== LLM CONTENT FILTER =====")
print("This example shows how to use LLM to generate focused markdown content.")
print("Note: This example requires an API key. Set it in environment variables.")
# Create LLM configuration
# Replace with your actual API key or set as environment variable
llm_config = LlmConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
)
print("\n📊 Setting up LLM content filter...")
print(f" - Provider: {llm_config.provider}")
print(" - API token: Using environment variable")
print(" - Instruction: Extract key concepts and summaries")
# Create markdown generator with LLM filter
markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter(
llmConfig=llm_config,
instruction="Extract key concepts and summaries"
)
)
config = CrawlerRunConfig(markdown_generator=markdown_generator)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://docs.crawl4ai.com", config=config)
pprint(result.markdown.fit_markdown)
print("\n✅ Generated focused markdown:")
# 6⃣ PDF Processing
async def pdf_processing():
"""
PART 6: PDF Processing
This function demonstrates:
- Using PDFCrawlerStrategy and PDFContentScrapingStrategy
- Extracting text and metadata from PDFs
"""
print("\n===== PDF PROCESSING =====")
print("This example shows how to extract text and metadata from PDF files.")
# Sample PDF URL
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf"
print("\n📊 Initializing PDF crawler...")
print(f" - Target PDF: {pdf_url}")
print(" - Using PDFCrawlerStrategy and PDFContentScrapingStrategy")
# Create crawler with PDF strategy
async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
print("\n🚀 Starting PDF processing...")
start_time = time.perf_counter()
result = await crawler.arun(
pdf_url,
config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
)
duration = time.perf_counter() - start_time
print(f"\n✅ Processed PDF in {duration:.2f} seconds")
# Show metadata
print("\n📄 PDF Metadata:")
if result.metadata:
for key, value in result.metadata.items():
if key not in ["html", "text", "markdown"] and value:
print(f" - {key}: {value}")
else:
print(" No metadata available")
# Show sample of content
if result.markdown:
print("\n📝 PDF Content Sample:")
content_sample = result.markdown[:500] + "..." if len(result.markdown) > 500 else result.markdown
print(f"---\n{content_sample}\n---")
else:
print("\n⚠️ No content extracted")
print("\n🔍 Key Takeaway: Crawl4AI can now process PDF files")
print(" to extract both text content and metadata.")
# 7⃣ LLM Schema Generation (requires API key)
async def llm_schema_generation():
"""
PART 7: LLM Schema Generation
This function demonstrates:
- Configuring LLM providers via LlmConfig
- Using LLM to generate extraction schemas
- JsonCssExtractionStrategy
Note: Requires a valid API key for the chosen LLM provider
"""
print("\n===== LLM SCHEMA GENERATION =====")
print("This example shows how to use LLM to automatically generate extraction schemas.")
print("Note: This example requires an API key. Set it in environment variables.")
# Sample HTML
sample_html = """
<div class="product">
<h2 class="title">Awesome Gaming Laptop</h2>
<div class="price">$1,299.99</div>
<div class="specs">
<ul>
<li>16GB RAM</li>
<li>512GB SSD</li>
<li>RTX 3080</li>
</ul>
</div>
<div class="rating">4.7/5</div>
</div>
"""
print("\n📊 Setting up LlmConfig...")
# Create LLM configuration
llm_config = LlmConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY"
)
print("\n🚀 Generating schema for product extraction...")
print(" This would use the LLM to analyze HTML and create an extraction schema")
schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html,
llmConfig = llm_config,
query="Extract product name and price"
)
print("\n✅ Generated Schema:")
pprint(schema)
# Run all sections
async def run_tutorial():
"""
Main function to run all tutorial sections.
"""
print("\n🚀 CRAWL4AI v0.5.0 TUTORIAL 🚀")
print("===============================")
print("This tutorial demonstrates the key features of Crawl4AI v0.5.0")
print("Including deep crawling, memory-adaptive dispatching, advanced filtering,")
print("and more powerful extraction capabilities.")
# Sections to run
sections = [
deep_crawl, # 1. Deep Crawling with Best-First Strategy
memory_adaptive_dispatcher, # 2. Memory-Adaptive Dispatcher
http_crawler_strategy, # 3. HTTP Crawler Strategy
proxy_rotation, # 4. Proxy Rotation
llm_content_filter, # 5. LLM Content Filter
pdf_processing, # 6. PDF Processing
llm_schema_generation, # 7. Schema Generation using LLM
]
for section in sections:
try:
await section()
except Exception as e:
print(f"⚠️ Error in {section.__name__}: {e}")
print("\n🎉 TUTORIAL COMPLETE! 🎉")
print("You've now explored the key features of Crawl4AI v0.5.0")
print("For more information, visit https://docs.crawl4ai.com")
# Run the tutorial
if __name__ == "__main__":
asyncio.run(run_tutorial())