feat(markdown): add content source selection for markdown generation
Adds a new content_source parameter to MarkdownGenerationStrategy that allows selecting which HTML content to use for markdown generation: - cleaned_html (default): uses post-processed HTML - raw_html: uses original webpage HTML - fit_html: uses preprocessed HTML for schema extraction Changes include: - Added content_source parameter to MarkdownGenerationStrategy - Updated AsyncWebCrawler to handle HTML source selection - Added examples and tests for the new feature - Updated documentation with new parameter details BREAKING CHANGE: Renamed cleaned_html parameter to input_html in generate_markdown() method signature to better reflect its generalized purpose
This commit is contained in:
42
docs/examples/markdown/content_source_short_example.py
Normal file
42
docs/examples/markdown/content_source_short_example.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
|
||||
async def demo_markdown_source_config():
|
||||
print("\n=== Demo: Configuring Markdown Source ===")
|
||||
|
||||
# Example 1: Generate markdown from cleaned HTML (default behavior)
|
||||
cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
|
||||
config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
|
||||
print("Markdown from Cleaned HTML (default):")
|
||||
print(f" Length: {len(result_cleaned.markdown.raw_markdown)}")
|
||||
print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...")
|
||||
|
||||
# Example 2: Generate markdown directly from raw HTML
|
||||
raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
|
||||
config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result_raw = await crawler.arun(url="https://example.com", config=config_raw)
|
||||
print("\nMarkdown from Raw HTML:")
|
||||
print(f" Length: {len(result_raw.markdown.raw_markdown)}")
|
||||
print(f" Start: {result_raw.markdown.raw_markdown[:100]}...")
|
||||
|
||||
# Example 3: Generate markdown from preprocessed 'fit' HTML
|
||||
fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
|
||||
config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result_fit = await crawler.arun(url="https://example.com", config=config_fit)
|
||||
print("\nMarkdown from Fit HTML:")
|
||||
print(f" Length: {len(result_fit.markdown.raw_markdown)}")
|
||||
print(f" Start: {result_fit.markdown.raw_markdown[:100]}...")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(demo_markdown_source_config())
|
||||
Reference in New Issue
Block a user