# crawl4ai/docs/examples/url_seeder/url_seeder_demo.py

"""
URL Seeder Demo - Interactive showcase of Crawl4AI's URL discovery capabilities

This demo shows:
1. Basic URL discovery from sitemaps and Common Crawl
2. Cache management and forced refresh
3. Live URL validation and metadata extraction
4. BM25 relevance scoring for intelligent filtering
5. Integration with AsyncWebCrawler for the complete pipeline
6. Multi-domain discovery across multiple sites

Note: AsyncUrlSeeder supports the async context manager protocol (`async with`) for automatic cleanup.
"""

import asyncio
import time
from datetime import datetime
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, BarColumn, TimeElapsedColumn
from rich.prompt import Prompt, Confirm
from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    AsyncUrlSeeder,
    SeedingConfig
)

# Shared Rich console used by the demo sections below.
console = Console()

# Banner; this runs at module level, i.e. as soon as the file is executed or imported.
console.rule("[bold green]🌐 Crawl4AI URL Seeder: Interactive Demo")

# Domain passed to `seed.urls(...)` by demo sections 1-5.
DOMAIN = "crawl4ai.com"

# Utils

def print_head_info(head_data):
    """Print a page's ``<head>`` metadata as a two-column Key/Value table.

    Args:
        head_data: Mapping that may hold ``title``, ``charset``, a ``meta``
            mapping, and a ``link`` mapping of rel -> iterable of mappings
            with an optional ``href``. A falsy value prints a short notice
            instead of a table.
    """
    if not head_data:
        console.print("[yellow]No head data found.")
        return

    # Collect (key, value) pairs first, in the same order they are displayed.
    rows = []
    for field in ("title", "charset"):
        value = head_data.get(field)
        if value:
            rows.append((field, value))
    for name, content in head_data.get("meta", {}).items():
        rows.append((f"meta:{name}", content))
    for rel, entries in head_data.get("link", {}).items():
        for entry in entries:
            rows.append((f"link:{rel}", entry.get("href", "")))

    table = Table(title="<head> Metadata", expand=True)
    table.add_column("Key", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")
    for key, value in rows:
        table.add_row(key, value)
    console.print(table)


async def section_1_basic_exploration(seed: AsyncUrlSeeder):
    """Demo 1: discover URLs for ``DOMAIN`` and report count, timing and samples.

    Requests URLs with ``source="cc+sitemap"`` and the catch-all pattern
    ``"*"``, shows a spinner while waiting, then prints how many URLs came
    back, the elapsed time, the throughput, and up to five sample URLs.

    Args:
        seed: Seeder whose ``urls(domain, config)`` coroutine returns a
            sequence of mappings, each with a ``"url"`` key.
    """
    console.rule("[bold cyan]1. Basic Seeding")
    cfg = SeedingConfig(source="cc+sitemap", pattern="*", verbose=True)

    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() can be coarse enough to report a zero-length interval.
    start_time = time.perf_counter()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description="Fetching from Common Crawl + Sitemap...", total=None)
        urls = await seed.urls(DOMAIN, cfg)
    elapsed = time.perf_counter() - start_time

    console.print(f"[green]✓ Fetched {len(urls)} URLs in {elapsed:.2f} seconds")
    # Guard the division so an (unlikely) zero interval cannot crash the demo.
    speed = f"{len(urls) / elapsed:.0f}" if elapsed > 0 else "n/a"
    console.print(f"[dim]  Speed: {speed} URLs/second[/dim]\n")

    if not urls:
        # Avoid printing a "Sample URLs" heading with nothing under it.
        console.print("[yellow]No URLs were discovered.")
        return

    console.print("[bold]Sample URLs:[/bold]")
    for u in urls[:5]:
        console.print(f"  • {u['url']}")


async def section_2_cache_demo(seed: AsyncUrlSeeder):
    """Demo 2: issue one seeding request for ``DOMAIN`` with ``force=True``.

    The on-screen message describes ``force=True`` as bypassing the cache.
    The URLs returned by the request are not displayed or otherwise used.

    Args:
        seed: Seeder providing the ``urls(domain, config)`` coroutine.
    """
    console.rule("[bold cyan]2. Caching Demonstration")
    console.print("[yellow]Using `force=True` to bypass cache and fetch fresh data.[/yellow]")

    forced_cfg = SeedingConfig(
        source="cc",
        pattern="*crawl4ai.com/core/*",
        verbose=False,
        force=True,
    )
    # Only the fetch itself matters here; the result is discarded.
    await seed.urls(DOMAIN, forced_cfg)

async def section_3_live_head(seed: AsyncUrlSeeder):
    """Demo 3: seed ``DOMAIN`` with head extraction and show one page's metadata.

    Prints how many returned entries have ``status == "valid"`` out of the
    total, then renders the ``head_data`` of the first valid entry (if any)
    via ``print_head_info``.

    Args:
        seed: Seeder whose ``urls(domain, config)`` coroutine returns a
            sequence of mappings with ``"status"`` and ``"head_data"`` keys.
    """
    console.rule("[bold cyan]3. Live Check + Head Extraction")

    options = {
        "extract_head": True,
        "concurrency": 10,
        "hits_per_sec": 5,
        "pattern": "*crawl4ai.com/*",
        "max_urls": 10,
        "verbose": False,
    }
    discovered = await seed.urls(DOMAIN, SeedingConfig(**options))

    usable = [entry for entry in discovered if entry["status"] == "valid"]
    console.print(f"[green]Valid: {len(usable)} / {len(discovered)}")

    if not usable:
        return
    first_valid = usable[0]
    print_head_info(first_valid["head_data"])


async def section_4_bm25_scoring(seed: AsyncUrlSeeder):
    """Demo 4: rank sitemap URLs of ``DOMAIN`` against a query and tabulate the top 5.

    Requests seeding with ``scoring_method="bm25"`` and a score threshold,
    then prints a table of score, title and URL for the first five results.

    Args:
        seed: Seeder whose ``urls(domain, config)`` coroutine returns a
            sequence of mappings with ``"url"``, ``"relevance_score"`` and
            ``"head_data"`` keys.
    """
    console.rule("[bold cyan]4. BM25 Relevance Scoring")
    console.print("[yellow]Using AI-powered relevance scoring to find the most relevant content[/yellow]")

    query = "markdown generation extraction strategies"
    # Single source of truth for the threshold used in both config and message.
    threshold = 0.3
    max_title_len = 60
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query=query,
        scoring_method="bm25",
        score_threshold=threshold,  # Only URLs with >30% relevance
        max_urls=20,
        verbose=False
    )

    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description=f"Searching for: '{query}'", total=None)
        urls = await seed.urls(DOMAIN, cfg)

    console.print(f"[green]Found {len(urls)} relevant URLs (score > {threshold})")

    # Show top results with scores
    table = Table(title="Top 5 Most Relevant Pages", expand=True)
    table.add_column("Score", style="cyan", width=8)
    table.add_column("Title", style="magenta")
    table.add_column("URL", style="blue", overflow="fold")

    for entry in urls[:5]:
        score = f"{entry['relevance_score']:.2f}"
        # dict.get(key, default) returns a stored None as-is, so fall back with
        # `or` to cover both a missing and an empty/None title (or head_data).
        head = entry["head_data"] or {}
        title = head.get("title") or "No title"
        # Only add an ellipsis when the title was actually shortened.
        if len(title) > max_title_len:
            title = title[:max_title_len] + "..."
        table.add_row(score, title, entry["url"])

    console.print(table)

async def section_5_keyword_filter_to_agent(seed: AsyncUrlSeeder):
    """Demo 5: discover URLs, keep those matching keywords, then crawl them.

    Steps:
        1. Seed ``DOMAIN`` with head extraction enabled.
        2. Keep entries whose stringified ``head_data`` contains any keyword
           (case-insensitive substring match).
        3. Crawl the kept URLs with ``AsyncWebCrawler.arun_many`` in streaming
           mode and print a short summary plus one Markdown snippet.

    If no URL matches the keywords, the crawl step is skipped so that no
    crawler is started for an empty work list.

    Args:
        seed: Seeder whose ``urls(domain, config)`` coroutine returns a
            sequence of mappings with ``"url"`` and ``"head_data"`` keys.
    """
    console.rule("[bold cyan]5. Complete Pipeline: Discover → Filter → Crawl")
    cfg = SeedingConfig(
        extract_head=True,
        concurrency=20,
        hits_per_sec=10,
        max_urls=10,
        pattern="*crawl4ai.com/*",
        force=True,
    )
    urls = await seed.urls(DOMAIN, cfg)

    keywords = ["deep crawling", "markdown", "llm"]
    selected = [u for u in urls if any(k in str(u["head_data"]).lower() for k in keywords)]

    console.print(f"[cyan]Selected {len(selected)} URLs with relevant keywords:")
    for u in selected[:10]:
        console.print("•", u["url"])

    if not selected:
        console.print("\n[yellow]No URLs matched the keywords; skipping the crawl step.")
        return

    console.print("\n[yellow]Passing above URLs to arun_many() LLM agent for crawling...")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_run_config = CrawlerRunConfig(
            # Example crawl settings for these URLs:
            only_text=True,  # Just get text content
            screenshot=False,
            pdf=False,
            word_count_threshold=50,  # Only process pages with at least 50 words
            stream=True,
            verbose=False,  # Keep logs clean for arun_many in this demo
        )

        # Extract just the URLs from the selected results
        urls_to_crawl = [u["url"] for u in selected]

        # Results arrive as a stream; they are collected here for demonstration
        crawled_results_stream = await crawler.arun_many(urls_to_crawl, config=crawl_run_config)
        final_crawled_data = []
        async for result in crawled_results_stream:
            final_crawled_data.append(result)
            if len(final_crawled_data) % 5 == 0:
                print(f"   Processed {len(final_crawled_data)}/{len(urls_to_crawl)} URLs...")

        # Count only results that report success; failed results are streamed
        # back as well and must not be reported as successful crawls.
        successful = [r for r in final_crawled_data if r.success]
        print(f"\n   Successfully crawled {len(successful)} of {len(final_crawled_data)} URLs.")

        if not successful:
            print("   No successful crawls found.")
            return

        # plain print() is used for the snippet so that square brackets in the
        # Markdown are not interpreted as Rich console markup.
        sample = next((r for r in successful if r.markdown.raw_markdown), None)
        if sample is None:
            print("   No successful crawls with markdown found.")
            return

        print("\n   Example of a crawled result's URL and Markdown (first successful one):")
        print(f"     URL: {sample.url}")
        print(f"     Markdown snippet: {sample.markdown.raw_markdown[:200]}...")


async def section_6_multi_domain(seed: AsyncUrlSeeder):
    """Demo 6: run one scored discovery across several domains and summarise it.

    Calls ``seed.many_urls(domains, cfg)`` with a BM25 query, then prints the
    total number of URLs, the elapsed time, and for each domain the number of
    results plus the score and title of its first result.

    Args:
        seed: Seeder whose ``many_urls(domains, config)`` coroutine returns a
            mapping of domain -> sequence of mappings with ``"relevance_score"``
            and ``"head_data"`` keys.
    """
    console.rule("[bold cyan]6. Multi-Domain Discovery")
    console.print("[yellow]Discovering Python tutorials across multiple educational sites[/yellow]\n")

    domains = ["docs.python.org", "realpython.com", "docs.crawl4ai.com"]
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query="python tutorial guide",
        scoring_method="bm25",
        score_threshold=0.2,
        max_urls=5  # Per domain
    )

    # perf_counter() is the clock intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description="Discovering across domains...", total=None)
        results = await seed.many_urls(domains, cfg)
    elapsed = time.perf_counter() - start_time

    total_urls = sum(len(urls) for urls in results.values())
    console.print(f"[green]✓ Found {total_urls} relevant URLs across {len(domains)} domains in {elapsed:.2f}s\n")

    # Show results per domain
    for domain, urls in results.items():
        console.print(f"[bold]{domain}:[/bold] {len(urls)} relevant pages")
        if not urls:
            continue
        top = urls[0]
        # dict.get(key, default) returns a stored None unchanged, so use `or`
        # to fall back for a missing, None or empty title (or head_data).
        title = (top["head_data"] or {}).get("title") or "No title"
        console.print(f"  Top result: [{top['relevance_score']:.2f}] {title}")


async def main():
    """Show the demo menu and run the selected section, or all of them.

    A single ``AsyncUrlSeeder`` is opened with ``async with`` and shared by
    every section that runs. When running everything, the user is asked
    after each section except the last whether to continue.
    """
    async with AsyncUrlSeeder() as seed:
        run_all_key = "7"
        last_demo_key = "6"

        # Interactive menu: key -> (label, coroutine function or None)
        menu = {
            "1": ("Basic URL Discovery", section_1_basic_exploration),
            "2": ("Cache Management Demo", section_2_cache_demo),
            "3": ("Live Check & Metadata Extraction", section_3_live_head),
            "4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
            "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
            "6": ("Multi-Domain Discovery", section_6_multi_domain),
            "7": ("Run All Demos", None),
        }

        console.print("\n[bold]Available Demos:[/bold]")
        for key, entry in menu.items():
            label = entry[0]
            console.print(f"  {key}. {label}")

        choice = Prompt.ask(
            "\n[cyan]Which demo would you like to run?[/cyan]",
            choices=list(menu),
            default=run_all_key,
        )

        console.print()

        if choice != run_all_key:
            # Run only the selected demo
            demo = menu[choice][1]
            await demo(seed)
        else:
            # Run every demo in menu order
            for key, (_, demo) in menu.items():
                if key == run_all_key or not demo:
                    continue
                await demo(seed)
                if key == last_demo_key:
                    # Don't pause after the last demo
                    continue
                keep_going = Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True)
                if not keep_going:
                    break
                console.print()

        console.rule("[bold green]Demo Complete ✔︎")


# Start the interactive demo only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())