merge next. Resolve conflicts. Fix some import errors and error handling in server.py

This commit is contained in:
Aravind Karnam
2025-04-19 20:27:47 +05:30
23 changed files with 5660 additions and 91 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,64 @@
"""
Example showing how to use the content_source parameter to control HTML input for markdown generation.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
async def demo_content_source():
"""Demonstrates different content_source options for markdown generation."""
url = "https://example.com" # Simple demo site
print("Crawling with different content_source options...")
# --- Example 1: Default Behavior (cleaned_html) ---
# This uses the HTML after it has been processed by the scraping strategy
# The HTML is cleaned, simplified, and optimized for readability
default_generator = DefaultMarkdownGenerator() # content_source="cleaned_html" is default
default_config = CrawlerRunConfig(markdown_generator=default_generator)
# --- Example 2: Raw HTML ---
# This uses the original HTML directly from the webpage
# Preserves more original content but may include navigation, ads, etc.
raw_generator = DefaultMarkdownGenerator(content_source="raw_html")
raw_config = CrawlerRunConfig(markdown_generator=raw_generator)
# --- Example 3: Fit HTML ---
# This uses preprocessed HTML optimized for schema extraction
# Better for structured data extraction but may lose some formatting
fit_generator = DefaultMarkdownGenerator(content_source="fit_html")
fit_config = CrawlerRunConfig(markdown_generator=fit_generator)
# Execute all three crawlers in sequence
async with AsyncWebCrawler() as crawler:
# Default (cleaned_html)
result_default = await crawler.arun(url=url, config=default_config)
# Raw HTML
result_raw = await crawler.arun(url=url, config=raw_config)
# Fit HTML
result_fit = await crawler.arun(url=url, config=fit_config)
# Print a summary of the results
print("\nMarkdown Generation Results:\n")
print("1. Default (cleaned_html):")
print(f" Length: {len(result_default.markdown.raw_markdown)} chars")
print(f" First 80 chars: {result_default.markdown.raw_markdown[:80]}...\n")
print("2. Raw HTML:")
print(f" Length: {len(result_raw.markdown.raw_markdown)} chars")
print(f" First 80 chars: {result_raw.markdown.raw_markdown[:80]}...\n")
print("3. Fit HTML:")
print(f" Length: {len(result_fit.markdown.raw_markdown)} chars")
print(f" First 80 chars: {result_fit.markdown.raw_markdown[:80]}...\n")
# Demonstrate differences in output
print("\nKey Takeaways:")
print("- cleaned_html: Best for readable, focused content")
print("- raw_html: Preserves more original content, but may include noise")
print("- fit_html: Optimized for schema extraction and structured data")
if __name__ == "__main__":
asyncio.run(demo_content_source())

View File

@@ -0,0 +1,42 @@
"""
Example demonstrating how to use the content_source parameter in MarkdownGenerationStrategy
"""
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
async def demo_markdown_source_config():
print("\n=== Demo: Configuring Markdown Source ===")
# Example 1: Generate markdown from cleaned HTML (default behavior)
cleaned_md_generator = DefaultMarkdownGenerator(content_source="cleaned_html")
config_cleaned = CrawlerRunConfig(markdown_generator=cleaned_md_generator)
async with AsyncWebCrawler() as crawler:
result_cleaned = await crawler.arun(url="https://example.com", config=config_cleaned)
print("Markdown from Cleaned HTML (default):")
print(f" Length: {len(result_cleaned.markdown.raw_markdown)}")
print(f" Start: {result_cleaned.markdown.raw_markdown[:100]}...")
# Example 2: Generate markdown directly from raw HTML
raw_md_generator = DefaultMarkdownGenerator(content_source="raw_html")
config_raw = CrawlerRunConfig(markdown_generator=raw_md_generator)
async with AsyncWebCrawler() as crawler:
result_raw = await crawler.arun(url="https://example.com", config=config_raw)
print("\nMarkdown from Raw HTML:")
print(f" Length: {len(result_raw.markdown.raw_markdown)}")
print(f" Start: {result_raw.markdown.raw_markdown[:100]}...")
# Example 3: Generate markdown from preprocessed 'fit' HTML
fit_md_generator = DefaultMarkdownGenerator(content_source="fit_html")
config_fit = CrawlerRunConfig(markdown_generator=fit_md_generator)
async with AsyncWebCrawler() as crawler:
result_fit = await crawler.arun(url="https://example.com", config=config_fit)
print("\nMarkdown from Fit HTML:")
print(f" Length: {len(result_fit.markdown.raw_markdown)}")
print(f" Start: {result_fit.markdown.raw_markdown[:100]}...")
if __name__ == "__main__":
asyncio.run(demo_markdown_source_config())