Merge branch 'next-fix-markdown-source' into next

2025-04-17 20:16:15 +08:00
parent 30ec4f571f 7db6b468d9
commit fd899f66aa
9 changed files with 383 additions and 24 deletions
--- a/docs/examples/markdown/content_source_example.py
+++ b/docs/examples/markdown/content_source_example.py
@@ -0,0 +1,64 @@
+"""
+Example showing how to use the content_source parameter to control HTML input for markdown generation.
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_content_source():
+    """Demonstrates different content_source options for markdown generation."""
+    url = "https://example.com"  # Simple demo site
+    
+    print("Crawling with different content_source options...")
+    
+    # --- Example 1: Default Behavior (cleaned_html) ---
+    # This uses the HTML after it has been processed by the scraping strategy
+    # The HTML is cleaned, simplified, and optimized for readability
+    default_generator = DefaultMarkdownGenerator()  # content_source="cleaned_html" is default
+    default_config = CrawlerRunConfig(markdown_generator=default_generator)
+    
+    # --- Example 2: Raw HTML ---
+    # This uses the original HTML directly from the webpage
+    # Preserves more original content but may include navigation, ads, etc.
+    raw_generator = DefaultMarkdownGenerator(content_source="raw_html")
+    raw_config = CrawlerRunConfig(markdown_generator=raw_generator)
+    
+    # --- Example 3: Fit HTML ---
+    # This uses preprocessed HTML optimized for schema extraction
+    # Better for structured data extraction but may lose some formatting
+    fit_generator = DefaultMarkdownGenerator(content_source="fit_html")
+    fit_config = CrawlerRunConfig(markdown_generator=fit_generator)
+    
+    # Execute all three crawlers in sequence
+    async with AsyncWebCrawler() as crawler:
+        # Default (cleaned_html)
+        result_default = await crawler.arun(url=url, config=default_config)
+        
+        # Raw HTML
+        result_raw = await crawler.arun(url=url, config=raw_config)
+        
+        # Fit HTML
+        result_fit = await crawler.arun(url=url, config=fit_config)
+    
+    # Print a summary of the results
+    print("\nMarkdown Generation Results:\n")
+    
+    print("1. Default (cleaned_html):")
+    print(f"   Length: {len(result_default.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n")
+    
+    print("2. Raw HTML:")
+    print(f"   Length: {len(result_raw.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n")
+    
+    print("3. Fit HTML:")
+    print(f"   Length: {len(result_fit.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n")
+    
+    # Demonstrate differences in output
+    print("\nKey Takeaways:")
+    print("- cleaned_html: Best for readable, focused content")
+    print("- raw_html: Preserves more original content, but may include noise")
+    print("- fit_html: Optimized for schema extraction and structured data")
+
+if __name__ == "__main__":  # Run the demo only when executed as a script
+    asyncio.run(demo_content_source())
--- a/docs/examples/markdown/content_source_short_example.py
+++ b/docs/examples/markdown/content_source_short_example.py
@@ -0,0 +1,42 @@
+"""
+Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_markdown_source_config():
+    print("\n=== Demo: Configuring Markdown Source ===")
+
+    # Example 1: Generate markdown from cleaned HTML (default behavior)
+    cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
+    config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
+
+    async with AsyncWebCrawler() as crawler:
+        result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
+        print("Markdown from Cleaned HTML (default):")
+        print(f"  Length: {len(result_cleaned.markdown.raw_markdown)}")
+        print(f"  Start: {result_cleaned.markdown.raw_markdown[:100]}...")
+
+    # Example 2: Generate markdown directly from raw HTML
+    raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
+    config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
+
+    async with AsyncWebCrawler() as crawler:
+        result_raw = await crawler.arun(url="https://example.com", config=config_raw)
+        print("\nMarkdown from Raw HTML:")
+        print(f"  Length: {len(result_raw.markdown.raw_markdown)}")
+        print(f"  Start: {result_raw.markdown.raw_markdown[:100]}...")
+
+    # Example 3: Generate markdown from preprocessed 'fit' HTML
+    fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
+    config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
+
+    async with AsyncWebCrawler() as crawler:
+        result_fit = await crawler.arun(url="https://example.com", config=config_fit)
+        print("\nMarkdown from Fit HTML:")
+        print(f"  Length: {len(result_fit.markdown.raw_markdown)}")
+        print(f"  Start: {result_fit.markdown.raw_markdown[:100]}...")
+
+if __name__ == "__main__":  # Run the demo only when executed as a script
+    asyncio.run(demo_markdown_source_config())