This commit introduces AsyncUrlSeeder, a high-performance URL discovery system that enables intelligent crawling at scale by pre-discovering and filtering URLs before crawling.

## Core Features

### AsyncUrlSeeder Component
- Discovers URLs from multiple sources:
  - Sitemaps (including nested and gzipped)
  - Common Crawl index
  - Combined sources for maximum coverage
- Extracts page metadata without full crawling:
  - Title, description, keywords
  - Open Graph and Twitter Card tags
  - JSON-LD structured data
  - Language and charset information
- BM25 relevance scoring for intelligent filtering:
  - Query-based URL discovery
  - Configurable score thresholds
  - Automatic ranking by relevance
- Performance optimizations:
  - Async/concurrent processing with configurable workers
  - Rate limiting (hits per second)
  - Automatic caching with TTL
  - Streaming results for large datasets

### SeedingConfig
Comprehensive configuration for URL seeding:
- Source selection (sitemap, cc, or both)
- URL pattern filtering with wildcards
- Live URL validation options
- Metadata extraction controls
- BM25 scoring parameters
- Concurrency and rate limiting

### Integration with AsyncWebCrawler
- Seamless pipeline: discover → filter → crawl
- Direct compatibility with arun_many()
- Significant resource savings by pre-filtering URLs

## Documentation
- Comprehensive guide comparing URL seeding vs. deep crawling
- Complete API reference with parameter tables
- Practical examples showing all features
- Performance benchmarks and best practices
- Integration patterns with AsyncWebCrawler

## Examples
- url_seeder_demo.py: Interactive Rich-based demo with:
  - Basic discovery
  - Cache management
  - Live validation
  - BM25 scoring
  - Multi-domain discovery
  - Complete pipeline integration
- url_seeder_quick_demo.py: Screenshot-friendly examples:
  - Pattern-based filtering
  - Metadata exploration
  - Smart search with BM25

## Testing
- Comprehensive test suite (test_async_url_seeder_bm25.py)
- Coverage of all major features
- Edge cases and error handling
- Performance and consistency tests

## Implementation Details
- Built on httpx with HTTP/2 support
- Optional dependencies: lxml, brotli, rank_bm25
- Cache management in ~/.crawl4ai/seeder_cache/
- Logger integration with AsyncLoggerBase
- Proper error handling and retry logic

## Bug Fixes
- Fixed logger color compatibility (lightblack → bright_black)
- Corrected URL extraction from seeder results for arun_many()
- Updated all examples and documentation with proper usage

This feature enables users to crawl smarter, not harder, by discovering and analyzing URLs before committing resources to crawling them.
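For a quick sense of the configuration surface, here is a minimal sketch assembled from the feature list above. `source`, `pattern`, and `extract_head` appear in the demo below; `concurrency`, `hits_per_sec`, `max_urls`, `force`, and the `"sitemap+cc"` literal are names inferred from this description and should be treated as assumptions, not confirmed API:

```python
# Hedged sketch of the SeedingConfig surface described in this commit.
# Parameters marked "assumed" are inferred from the feature list above,
# not confirmed against the shipped API.
import asyncio

from crawl4ai import AsyncUrlSeeder, SeedingConfig


async def seed():
    seeder = AsyncUrlSeeder()
    config = SeedingConfig(
        source="sitemap+cc",  # assumed literal for "combined sources"
        pattern="*/blog/*",   # wildcard URL filtering
        extract_head=True,    # page metadata without a full crawl
        concurrency=20,       # assumed: configurable workers
        hits_per_sec=10,      # assumed: rate limiting
        max_urls=500,         # assumed: cap for very large domains
        force=True,           # assumed: bypass the TTL cache
    )
    urls = await seeder.urls("example.com", config)
    print(f"Discovered {len(urls)} URLs")


asyncio.run(seed())
```

The quick demo shipped with this commit (below) exercises the confirmed parts of this surface end to end.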
url_seeder_quick_demo.py (128 lines, 5.1 KiB, Python):
"""
|
|
🚀 URL Seeder + AsyncWebCrawler = Magic!
|
|
Quick demo showing discovery → filter → crawl pipeline
|
|
"""
|
|
import asyncio
|
|
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
|
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
|
|
|
# 🔍 Example 1: Discover ALL → Filter → Crawl
async def discover_and_crawl():
    """Find Python module tutorials & extract them all!"""
    seeder = AsyncUrlSeeder(
        logger=AsyncLogger()  # Log everything
    )

    # Step 1: See how many URLs exist (spoiler: A LOT!)
    print("📊 Let's see what RealPython has...")
    all_urls = await seeder.urls("realpython.com",
                                 SeedingConfig(source="sitemap"))
    print(f"😱 Found {len(all_urls)} total URLs!")

    # Step 2: Filter for Python modules (perfect size ~13)
    print("\n🎯 Filtering for 'python-modules' tutorials...")
    module_urls = await seeder.urls("realpython.com",
                                    SeedingConfig(
                                        source="sitemap",
                                        pattern="*python-modules*",
                                        live_check=True,  # Make sure they're alive!
                                    ))

    print(f"✨ Found {len(module_urls)} module tutorials")
    for url in module_urls[:3]:  # Show first 3
        status = "✅" if url["status"] == "valid" else "❌"
        print(f"{status} {url['url']}")

    # Step 3: Crawl them with pruning (keep it lean!)
    print("\n🕷️ Crawling the module tutorials...")
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(  # Smart filtering!
                    threshold=0.48,  # Remove fluff
                    threshold_type="fixed",
                ),
            ),
            only_text=True,
            stream=True,  # arun_many yields results as they finish
        )

        # Extract just the URLs from the seeder results
        # (first 5 only, to keep the demo quick)
        urls_to_crawl = [u["url"] for u in module_urls[:5]]
        results = await crawler.arun_many(urls_to_crawl, config=config)

        # Process & save
        saved = 0
        async for result in results:
            if result.success:
                # Save each tutorial, named after its URL slug
                # (rstrip handles URLs with or without a trailing slash)
                name = result.url.rstrip("/").split("/")[-1] + ".md"
                with open(name, "w", encoding="utf-8") as f:
                    f.write(result.markdown.fit_markdown)
                saved += 1
                print(f"💾 Saved: {name}")

    print(f"\n🎉 Successfully saved {saved} tutorials!")

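# The payoff of seeding first: the sitemap exposes every URL on the site,
# but arun_many() only ever sees the handful that matched our pattern and
# passed the live check - the pre-filtering saving the commit describes.
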
# 🔍 Example 2: Beautiful Soup articles with metadata peek
async def explore_beautifulsoup():
    """Discover Beautiful Soup content & peek at metadata"""
    seeder = AsyncUrlSeeder(logger=AsyncLogger())

    print("🍲 Looking for Beautiful Soup articles...")
    soup_urls = await seeder.urls("realpython.com",
                                  SeedingConfig(
                                      source="sitemap",
                                      pattern="*beautiful-soup*",
                                      extract_head=True,  # Get the metadata!
                                  ))

    print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")

    # Show what we discovered
    for i, url in enumerate(soup_urls, 1):
        meta = url["head_data"]["meta"]

        print(f"{i}. {url['head_data']['title']}")
        print(f"   📝 {meta.get('description', 'No description')[:60]}...")
        print(f"   👤 By: {meta.get('author', 'Unknown')}")
        print(f"   🔗 {url['url']}\n")

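# Note: head_data is only populated when extract_head=True. Besides the
# 'title' and 'meta' keys used above, the commit says it also carries
# Open Graph/Twitter Card tags and JSON-LD; the exact keys for those are
# not shown here, so check the API reference before relying on them.
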
# 🔍 Example 3: Smart search with BM25 relevance scoring
async def smart_search_with_bm25():
    """Use BM25 relevance scoring to surface the best content"""
    seeder = AsyncUrlSeeder(logger=AsyncLogger())

    print("🧠 Smart search: 'web scraping tutorial quiz'")

    # Search with BM25 scoring - the best matches rise to the top!
    results = await seeder.urls("realpython.com",
                                SeedingConfig(
                                    source="sitemap",
                                    pattern="*beautiful-soup*",
                                    extract_head=True,  # Give BM25 text to score
                                    query="web scraping tutorial quiz",  # Our search
                                    scoring_method="bm25",
                                    score_threshold=0.2,  # Quality filter
                                ))

    print(f"\n🎯 {len(results)} relevant results found - showing the top 3:\n")

    # Show ranked results with relevance scores
    for i, result in enumerate(results[:3], 1):
        print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
        print(f"   🔗 {result['url'][:60]}...")

    print("\n✨ BM25 automatically ranked these by relevance!")

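# BM25 here is classic lexical ranking, not an LLM: the query is scored
# against the text pulled out via extract_head (presumably why that option
# is on), anything under score_threshold is dropped, and what's left comes
# back sorted by relevance_score.
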
# 🎬 Run the show!
async def main():
    print("=" * 60)
    await discover_and_crawl()
    print("\n" + "=" * 60 + "\n")
    await explore_beautifulsoup()
    print("\n" + "=" * 60 + "\n")
    await smart_search_with_bm25()


if __name__ == "__main__":
    asyncio.run(main())
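# To run this demo: the commit lists lxml, brotli, and rank_bm25 as the
# seeder's optional dependencies, so something like the following should
# work (exact install command is an assumption):
#   pip install crawl4ai rank_bm25 lxml brotli
#   python url_seeder_quick_demo.py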