feat(markdown): add content source selection for markdown generation

Adds a new content_source parameter to MarkdownGenerationStrategy that allows selecting which HTML content to use for markdown generation:

- cleaned_html (default): uses post-processed HTML
- raw_html: uses original webpage HTML
- fit_html: uses preprocessed HTML for schema extraction

Changes include:

- Added content_source parameter to MarkdownGenerationStrategy
- Updated AsyncWebCrawler to handle HTML source selection
- Added examples and tests for the new feature
- Updated documentation with new parameter details

BREAKING CHANGE: Renamed cleaned_html parameter to input_html in generate_markdown() method signature to better reflect its generalized purpose
2025-04-17 20:13:53 +08:00
parent 94d486579c
commit 7db6b468d9
9 changed files with 383 additions and 24 deletions
--- a/docs/examples/markdown/content_source_example.py
+++ b/docs/examples/markdown/content_source_example.py
@@ -0,0 +1,64 @@
+"""
+Example showing how to use the content_source parameter to control HTML input for markdown generation.
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_content_source():
+    """Crawl one URL once per content_source option and print a summary of each markdown result."""
+    url = "https://example.com"  # Simple demo site
+    
+    print("Crawling with different content_source options...")
+    
+    # --- Example 1: Default Behavior (cleaned_html) ---
+    # Markdown is generated from the HTML after the scraping strategy has processed it
+    # No content_source argument is passed here, so the generator's default applies
+    default_generator = DefaultMarkdownGenerator()  # content_source="cleaned_html" is default
+    default_config = CrawlerRunConfig(markdown_generator=default_generator)
+    
+    # --- Example 2: Raw HTML ---
+    # Markdown is generated from the original HTML of the webpage
+    # Preserves more original content but may include navigation, ads, etc.
+    raw_generator = DefaultMarkdownGenerator(content_source="raw_html")
+    raw_config = CrawlerRunConfig(markdown_generator=raw_generator)
+    
+    # --- Example 3: Fit HTML ---
+    # Markdown is generated from the preprocessed HTML intended for schema extraction
+    # Better for structured data extraction but may lose some formatting
+    fit_generator = DefaultMarkdownGenerator(content_source="fit_html")
+    fit_config = CrawlerRunConfig(markdown_generator=fit_generator)
+    
+    # Run the three crawls one after another on a single shared crawler instance
+    async with AsyncWebCrawler() as crawler:
+        # Default (cleaned_html)
+        result_default = await crawler.arun(url=url, config=default_config)
+        
+        # Raw HTML
+        result_raw = await crawler.arun(url=url, config=raw_config)
+        
+        # Fit HTML
+        result_fit = await crawler.arun(url=url, config=fit_config)
+    
+    # Print a summary of the results; NOTE(review): result.markdown is used without checking that each crawl succeeded
+    print("\nMarkdown Generation Results:\n")
+    
+    print("1. Default (cleaned_html):")
+    print(f"   Length: {len(result_default.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n")
+    
+    print("2. Raw HTML:")
+    print(f"   Length: {len(result_raw.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n")
+    
+    print("3. Fit HTML:")
+    print(f"   Length: {len(result_fit.markdown.raw_markdown)} chars")
+    print(f"   First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n")
+    
+    # Print fixed guidance on when each content_source option is the better choice
+    print("\nKey Takeaways:")
+    print("- cleaned_html: Best for readable, focused content")
+    print("- raw_html: Preserves more original content, but may include noise")
+    print("- fit_html: Optimized for schema extraction and structured data")
+
+if __name__ == "__main__":  # Run the demo only when this file is executed directly as a script
+    asyncio.run(demo_content_source())