This commit introduces AsyncUrlSeeder, a high-performance URL discovery system that enables intelligent crawling at scale by pre-discovering and filtering URLs before crawling.

## Core Features

### AsyncUrlSeeder Component
- Discovers URLs from multiple sources:
  - Sitemaps (including nested and gzipped)
  - Common Crawl index
  - Combined sources for maximum coverage
- Extracts page metadata without full crawling:
  - Title, description, keywords
  - Open Graph and Twitter Card tags
  - JSON-LD structured data
  - Language and charset information
- BM25 relevance scoring for intelligent filtering:
  - Query-based URL discovery
  - Configurable score thresholds
  - Automatic ranking by relevance
- Performance optimizations:
  - Async/concurrent processing with configurable workers
  - Rate limiting (hits per second)
  - Automatic caching with TTL
  - Streaming results for large datasets

### SeedingConfig
Comprehensive configuration for URL seeding:
- Source selection (sitemap, cc, or both)
- URL pattern filtering with wildcards
- Live URL validation options
- Metadata extraction controls
- BM25 scoring parameters
- Concurrency and rate limiting

### Integration with AsyncWebCrawler
- Seamless pipeline: discover → filter → crawl
- Direct compatibility with arun_many()
- Significant resource savings by pre-filtering URLs

## Documentation
- Comprehensive guide comparing URL seeding vs. deep crawling
- Complete API reference with parameter tables
- Practical examples showing all features
- Performance benchmarks and best practices
- Integration patterns with AsyncWebCrawler

## Examples
- url_seeder_demo.py: Interactive Rich-based demo with:
  - Basic discovery
  - Cache management
  - Live validation
  - BM25 scoring
  - Multi-domain discovery
  - Complete pipeline integration
- url_seeder_quick_demo.py: Screenshot-friendly examples:
  - Pattern-based filtering
  - Metadata exploration
  - Smart search with BM25

## Testing
- Comprehensive test suite (test_async_url_seeder_bm25.py)
- Coverage of all major features
- Edge cases and error handling
- Performance and consistency tests

## Implementation Details
- Built on httpx with HTTP/2 support
- Optional dependencies: lxml, brotli, rank_bm25
- Cache management in ~/.crawl4ai/seeder_cache/
- Logger integration with AsyncLoggerBase
- Proper error handling and retry logic

## Bug Fixes
- Fixed logger color compatibility (lightblack → bright_black)
- Corrected URL extraction from seeder results for arun_many()
- Updated all examples and documentation with proper usage

This feature enables users to crawl smarter, not harder, by discovering and analyzing URLs before committing resources to crawling them.
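For a quick sense of the configuration surface, here is a minimal sketch assembled from the feature list above. `source`, `pattern`, and `extract_head` appear in the demo below; `concurrency`, `hits_per_sec`, `max_urls`, `force`, and the `"sitemap+cc"` literal are names inferred from this description and should be treated as assumptions, not confirmed API:

```python
# Hedged sketch of the SeedingConfig surface described in this commit.
# Parameters marked "assumed" are inferred from the feature list above,
# not confirmed against the shipped API.
import asyncio

from crawl4ai import AsyncUrlSeeder, SeedingConfig


async def seed():
    seeder = AsyncUrlSeeder()
    config = SeedingConfig(
        source="sitemap+cc",  # assumed literal for "combined sources"
        pattern="*/blog/*",   # wildcard URL filtering
        extract_head=True,    # page metadata without a full crawl
        concurrency=20,       # assumed: configurable workers
        hits_per_sec=10,      # assumed: rate limiting
        max_urls=500,         # assumed: cap for very large domains
        force=True,           # assumed: bypass the TTL cache
    )
    urls = await seeder.urls("example.com", config)
    print(f"Discovered {len(urls)} URLs")


asyncio.run(seed())
```

The quick demo shipped with this commit (below) exercises the confirmed parts of this surface end to end.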
url_seeder_quick_demo.py (128 lines, 5.1 KiB, Python):
"""
|
|
🚀 URL Seeder + AsyncWebCrawler = Magic!
|
|
Quick demo showing discovery → filter → crawl pipeline
|
|
"""
|
|
import asyncio
|
|
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
|
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
|
|
|
# 🔍 Example 1: Discover ALL → Filter → Crawl
async def discover_and_crawl():
    """Find Python module tutorials & extract them all!"""
    seeder = AsyncUrlSeeder(
        logger=AsyncLogger()  # Log everything
    )

    # Step 1: See how many URLs exist (spoiler: A LOT!)
    print("📊 Let's see what RealPython has...")
    all_urls = await seeder.urls("realpython.com",
                                 SeedingConfig(source="sitemap"))
    print(f"😱 Found {len(all_urls)} total URLs!")

    # Step 2: Filter for Python modules (perfect size ~13)
    print("\n🎯 Filtering for 'python-modules' tutorials...")
    module_urls = await seeder.urls("realpython.com",
                                    SeedingConfig(
                                        source="sitemap",
                                        pattern="*python-modules*",
                                        live_check=True,  # Make sure they're alive!
                                    ))

    print(f"✨ Found {len(module_urls)} module tutorials")
    for url in module_urls[:3]:  # Show first 3
        status = "✅" if url["status"] == "valid" else "❌"
        print(f"{status} {url['url']}")

    # Step 3: Crawl them with pruning (keep it lean!)
    print("\n🕷️ Crawling the module tutorials...")
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(  # Smart filtering!
                    threshold=0.48,  # Remove fluff
                    threshold_type="fixed",
                ),
            ),
            only_text=True,
            stream=True,  # arun_many yields results as they finish
        )

        # Extract just the URLs from the seeder results
        # (first 5 only, to keep the demo quick)
        urls_to_crawl = [u["url"] for u in module_urls[:5]]
        results = await crawler.arun_many(urls_to_crawl, config=config)

        # Process & save
        saved = 0
        async for result in results:
            if result.success:
                # Save each tutorial, named after its URL slug
                # (rstrip handles URLs with or without a trailing slash)
                name = result.url.rstrip("/").split("/")[-1] + ".md"
                with open(name, "w", encoding="utf-8") as f:
                    f.write(result.markdown.fit_markdown)
                saved += 1
                print(f"💾 Saved: {name}")

    print(f"\n🎉 Successfully saved {saved} tutorials!")

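# The payoff of seeding first: the sitemap exposes every URL on the site,
# but arun_many() only ever sees the handful that matched our pattern and
# passed the live check - the pre-filtering saving the commit describes.
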
# 🔍 Example 2: Beautiful Soup articles with metadata peek
async def explore_beautifulsoup():
    """Discover Beautiful Soup content & peek at metadata"""
    seeder = AsyncUrlSeeder(logger=AsyncLogger())

    print("🍲 Looking for Beautiful Soup articles...")
    soup_urls = await seeder.urls("realpython.com",
                                  SeedingConfig(
                                      source="sitemap",
                                      pattern="*beautiful-soup*",
                                      extract_head=True,  # Get the metadata!
                                  ))

    print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")

    # Show what we discovered
    for i, url in enumerate(soup_urls, 1):
        meta = url["head_data"]["meta"]

        print(f"{i}. {url['head_data']['title']}")
        print(f"   📝 {meta.get('description', 'No description')[:60]}...")
        print(f"   👤 By: {meta.get('author', 'Unknown')}")
        print(f"   🔗 {url['url']}\n")

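# Note: head_data is only populated when extract_head=True. Besides the
# 'title' and 'meta' keys used above, the commit says it also carries
# Open Graph/Twitter Card tags and JSON-LD; the exact keys for those are
# not shown here, so check the API reference before relying on them.
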
# 🔍 Example 3: Smart search with BM25 relevance scoring
async def smart_search_with_bm25():
    """Use BM25 relevance scoring to surface the best content"""
    seeder = AsyncUrlSeeder(logger=AsyncLogger())

    print("🧠 Smart search: 'web scraping tutorial quiz'")

    # Search with BM25 scoring - the best matches rise to the top!
    results = await seeder.urls("realpython.com",
                                SeedingConfig(
                                    source="sitemap",
                                    pattern="*beautiful-soup*",
                                    extract_head=True,  # Give BM25 text to score
                                    query="web scraping tutorial quiz",  # Our search
                                    scoring_method="bm25",
                                    score_threshold=0.2,  # Quality filter
                                ))

    print(f"\n🎯 {len(results)} relevant results found - showing the top 3:\n")

    # Show ranked results with relevance scores
    for i, result in enumerate(results[:3], 1):
        print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
        print(f"   🔗 {result['url'][:60]}...")

    print("\n✨ BM25 automatically ranked these by relevance!")

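# BM25 here is classic lexical ranking, not an LLM: the query is scored
# against the text pulled out via extract_head (presumably why that option
# is on), anything under score_threshold is dropped, and what's left comes
# back sorted by relevance_score.
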
# 🎬 Run the show!
async def main():
    print("=" * 60)
    await discover_and_crawl()
    print("\n" + "=" * 60 + "\n")
    await explore_beautifulsoup()
    print("\n" + "=" * 60 + "\n")
    await smart_search_with_bm25()


if __name__ == "__main__":
    asyncio.run(main())
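# To run this demo: the commit lists lxml, brotli, and rank_bm25 as the
# seeder's optional dependencies, so something like the following should
# work (exact install command is an assumption):
#   pip install crawl4ai rank_bm25 lxml brotli
#   python url_seeder_quick_demo.py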