Merge branch 'next-fix-markdown-source' into next

2025-04-17 20:16:15 +08:00
parent 30ec4f571f 7db6b468d9
commit fd899f66aa
9 changed files with 383 additions and 24 deletions
--- a/docs/examples/markdown/content_source_example.py
+++ b/docs/examples/markdown/content_source_example.py
@@ -0,0 +1,64 @@
+"""
+Example showing how to use the content_source parameter to control HTML input for markdown generation.
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_content_source():
+    """Crawl one page once per ``content_source`` option and compare results.
+
+    Builds a ``DefaultMarkdownGenerator`` for each HTML source, runs the
+    crawls in sequence through a single ``AsyncWebCrawler`` session, and
+    prints the length and first 80 characters of each markdown result. A
+    failed crawl is reported with its error message instead of reading
+    its markdown.
+    """
+    url = "https://example.com"  # Simple demo site
+
+    print("Crawling with different content_source options...")
+
+    # (label, generator) pairs, in the order they are crawled and reported.
+    generators = [
+        # --- Example 1: Default Behavior (cleaned_html) ---
+        # Uses the HTML after it has been processed by the scraping strategy;
+        # content_source="cleaned_html" is the default.
+        ("Default (cleaned_html)", DefaultMarkdownGenerator()),
+        # --- Example 2: Raw HTML ---
+        # Uses the original HTML directly from the webpage; preserves more
+        # original content but may include navigation, ads, etc.
+        ("Raw HTML", DefaultMarkdownGenerator(content_source="raw_html")),
+        # --- Example 3: Fit HTML ---
+        # Uses preprocessed HTML optimized for schema extraction; better for
+        # structured data extraction but may lose some formatting.
+        ("Fit HTML", DefaultMarkdownGenerator(content_source="fit_html")),
+    ]
+
+    # Execute all three crawls in sequence, reusing one crawler session.
+    results = []
+    async with AsyncWebCrawler() as crawler:
+        for label, generator in generators:
+            config = CrawlerRunConfig(markdown_generator=generator)
+            results.append((label, await crawler.arun(url=url, config=config)))
+
+    # Print a summary of the results
+    print("\nMarkdown Generation Results:\n")
+
+    for index, (label, result) in enumerate(results, start=1):
+        print(f"{index}. {label}:")
+        if not result.success:
+            # Mirror the documentation example: report the error rather
+            # than dereferencing result.markdown on a failed crawl.
+            print(f"   Crawl failed: {result.error_message}\n")
+            continue
+        markdown = result.markdown.raw_markdown
+        print(f"   Length: {len(markdown)} chars")
+        print(f"   First 80 chars: {markdown[:80]}...\n")
+
+    # Demonstrate differences in output
+    print("\nKey Takeaways:")
+    print("- cleaned_html: Best for readable, focused content")
+    print("- raw_html: Preserves more original content, but may include noise")
+    print("- fit_html: Optimized for schema extraction and structured data")
+
+if __name__ == "__main__":
+    asyncio.run(demo_content_source())
--- a/docs/examples/markdown/content_source_short_example.py
+++ b/docs/examples/markdown/content_source_short_example.py
@@ -0,0 +1,42 @@
+"""
+Example demonstrating how to use the content_source parameter of DefaultMarkdownGenerator.
+"""
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
+
+async def demo_markdown_source_config():
+    """Print markdown generated from each ``content_source`` option.
+
+    Crawls https://example.com once per option through one shared
+    ``AsyncWebCrawler`` session and prints the length and first 100
+    characters of the raw markdown, or the error message when a crawl fails.
+    """
+    print("\n=== Demo: Configuring Markdown Source ===")
+
+    # (heading, content_source) pairs, in the order they are demonstrated.
+    examples = [
+        # Example 1: Generate markdown from cleaned HTML (default behavior)
+        ("Markdown from Cleaned HTML (default):", "cleaned_html"),
+        # Example 2: Generate markdown directly from raw HTML
+        ("\nMarkdown from Raw HTML:", "raw_html"),
+        # Example 3: Generate markdown from preprocessed 'fit' HTML
+        ("\nMarkdown from Fit HTML:", "fit_html"),
+    ]
+
+    async with AsyncWebCrawler() as crawler:
+        for heading, content_source in examples:
+            md_generator = DefaultMarkdownGenerator(content_source=content_source)
+            config = CrawlerRunConfig(markdown_generator=md_generator)
+            result = await crawler.arun(url="https://example.com", config=config)
+            print(heading)
+            if not result.success:
+                # Report the failure instead of reading result.markdown.
+                print(f"  Crawl failed: {result.error_message}")
+                continue
+            raw_markdown = result.markdown.raw_markdown
+            print(f"  Length: {len(raw_markdown)}")
+            print(f"  Start: {raw_markdown[:100]}...")
+
+if __name__ == "__main__":
+    asyncio.run(demo_markdown_source_config())
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -70,7 +70,7 @@ We group them by category.
 |------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
 | **`word_count_threshold`**   | `int` (default: ~200)                | Skips text blocks below X words. Helps ignore trivial sections.                                 |
 | **`extraction_strategy`**    | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.).                                  |
-| **`markdown_generator`**     | `MarkdownGenerationStrategy` (None)  | If you want specialized markdown output (citations, filtering, chunking, etc.).                 |
+| **`markdown_generator`**     | `MarkdownGenerationStrategy` (None)  | If you want specialized markdown output (citations, filtering, chunking, etc.). The generator's `content_source` parameter selects the HTML input: `"cleaned_html"` (default), `"raw_html"`, or `"fit_html"`. |
 | **`css_selector`**           | `str` (None)                         | Retains only the part of the page matching this selector. Affects the entire extraction process. |
 | **`target_elements`**        | `List[str]` (None)                   | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
 | **`excluded_tags`**          | `list` (None)                        | Removes entire tags (e.g. `["script", "style"]`).                                               |
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -111,13 +111,71 @@ Some commonly used `options`:
 - **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page.  
 - **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way.

+## 4. Selecting the HTML Source for Markdown Generation
+
+The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown.
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Option 1: Use the raw HTML directly from the webpage (before any processing)
+    raw_md_generator = DefaultMarkdownGenerator(
+        content_source="raw_html",
+        options={"ignore_links": True}
+    )
+    
+    # Option 2: Use the cleaned HTML (after scraping strategy processing - default)
+    cleaned_md_generator = DefaultMarkdownGenerator(
+        content_source="cleaned_html",  # This is the default
+        options={"ignore_links": True}
+    )
+    
+    # Option 3: Use preprocessed HTML optimized for schema extraction
+    fit_md_generator = DefaultMarkdownGenerator(
+        content_source="fit_html",
+        options={"ignore_links": True}
+    )
+    
+    # Use one of the generators in your crawler config
+    config = CrawlerRunConfig(
+        markdown_generator=raw_md_generator  # Try each of the generators
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        if result.success:
+            print("Markdown:\n", result.markdown.raw_markdown[:500])
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+
+### HTML Source Options
+
+- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed.
+
+- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content.
+
+- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed.
+
+### When to Use Each Option
+
+- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal.
+- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep.
+- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction.
+
 ---

-## 4. Content Filters
+## 5. Content Filters

 **Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want.

-### 4.1 BM25ContentFilter
+### 5.1 BM25ContentFilter

 If you have a **search query**, BM25 is a good choice:

@@ -146,7 +204,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)

 **No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.

-### 4.2 PruningContentFilter
+### 5.2 PruningContentFilter

 If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections.

@@ -170,7 +228,7 @@ prune_filter = PruningContentFilter(
 - You want a broad cleanup without a user query.  
 - The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.

-### 4.3 LLMContentFilter
+### 5.3 LLMContentFilter

 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

@@ -247,7 +305,7 @@ filter = LLMContentFilter(

 ---

-## 5. Using Fit Markdown
+## 6. Using Fit Markdown

 When a content filter is active, the library produces two forms of markdown inside `result.markdown`:

@@ -284,7 +342,7 @@ if __name__ == "__main__":

 ---

-## 6. The `MarkdownGenerationResult` Object
+## 7. The `MarkdownGenerationResult` Object

 If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as:

@@ -315,7 +373,7 @@ Below is a **revised section** under “Combining Filters (BM25 + Pruning)” th

 ---

-## 7. Combining Filters (BM25 + Pruning) in Two Passes
+## 8. Combining Filters (BM25 + Pruning) in Two Passes

 You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead:

@@ -407,7 +465,7 @@ If your codebase or pipeline design allows applying multiple filters in one pass

 ---

-## 8. Common Pitfalls & Tips
+## 9. Common Pitfalls & Tips

 1. **No Markdown Output?**  
   - Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements.  
@@ -427,11 +485,12 @@ If your codebase or pipeline design allows applying multiple filters in one pass

 ---

-## 9. Summary & Next Steps
+## 10. Summary & Next Steps

 In this **Markdown Generation Basics** tutorial, you learned to:

 - Configure the **DefaultMarkdownGenerator** with HTML-to-text options.  
+- Select different HTML sources using the `content_source` parameter.  
 - Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal.  
 - Distinguish between raw and filtered markdown (`fit_markdown`).  
 - Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.).