This commit introduces AsyncUrlSeeder, a high-performance URL discovery system that enables intelligent crawling at scale by pre-discovering and filtering URLs before crawling.

## Core Features

### AsyncUrlSeeder Component
- Discovers URLs from multiple sources:
  - Sitemaps (including nested and gzipped)
  - Common Crawl index
  - Combined sources for maximum coverage
- Extracts page metadata without full crawling:
  - Title, description, keywords
  - Open Graph and Twitter Card tags
  - JSON-LD structured data
  - Language and charset information
- BM25 relevance scoring for intelligent filtering:
  - Query-based URL discovery
  - Configurable score thresholds
  - Automatic ranking by relevance
- Performance optimizations:
  - Async/concurrent processing with configurable workers
  - Rate limiting (hits per second)
  - Automatic caching with TTL
  - Streaming results for large datasets

### SeedingConfig
- Comprehensive configuration for URL seeding:
  - Source selection (sitemap, cc, or both)
  - URL pattern filtering with wildcards
  - Live URL validation options
  - Metadata extraction controls
  - BM25 scoring parameters
  - Concurrency and rate limiting

### Integration with AsyncWebCrawler
- Seamless pipeline: discover → filter → crawl (see the sketch after these notes)
- Direct compatibility with arun_many()
- Significant resource savings by pre-filtering URLs

## Documentation
- Comprehensive guide comparing URL seeding vs. deep crawling
- Complete API reference with parameter tables
- Practical examples showing all features
- Performance benchmarks and best practices
- Integration patterns with AsyncWebCrawler

## Examples
- url_seeder_demo.py: Interactive Rich-based demo with:
  - Basic discovery
  - Cache management
  - Live validation
  - BM25 scoring
  - Multi-domain discovery
  - Complete pipeline integration
- url_seeder_quick_demo.py: Screenshot-friendly examples:
  - Pattern-based filtering
  - Metadata exploration
  - Smart search with BM25

## Testing
- Comprehensive test suite (test_async_url_seeder_bm25.py)
- Coverage of all major features
- Edge cases and error handling
- Performance and consistency tests

## Implementation Details
- Built on httpx with HTTP/2 support
- Optional dependencies: lxml, brotli, rank_bm25
- Cache management in ~/.crawl4ai/seeder_cache/
- Logger integration with AsyncLoggerBase
- Proper error handling and retry logic

## Bug Fixes
- Fixed logger color compatibility (lightblack → bright_black)
- Corrected URL extraction from seeder results for arun_many()
- Updated all examples and documentation with proper usage

This feature enables users to crawl smarter, not harder, by discovering and analyzing URLs before committing resources to crawling them.
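
A minimal sketch of the discover → filter → crawl pipeline, condensed from the demo file that follows; the domain, query, and threshold values are illustrative placeholders, not canonical defaults:

```python
import asyncio

from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, CrawlerRunConfig, SeedingConfig


async def discover_then_crawl():
    # 1. Discover: pull candidate URLs from the sitemap and score them against
    #    a query (BM25 scoring assumes extract_head=True and rank_bm25 installed).
    seeder = AsyncUrlSeeder()
    cfg = SeedingConfig(
        source="sitemap",             # "sitemap", "cc", or "cc+sitemap"
        extract_head=True,            # fetch <head> metadata, not full pages
        query="markdown extraction",  # placeholder query
        scoring_method="bm25",
        score_threshold=0.3,          # 2. Filter: drop low-relevance URLs
        max_urls=20,
    )
    candidates = await seeder.urls("example.com", cfg)  # placeholder domain

    # 3. Crawl: hand only the survivors to arun_many()
    urls = [c["url"] for c in candidates]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=CrawlerRunConfig(only_text=True))
        for r in results:
            print(r.url, r.success)

asyncio.run(discover_then_crawl())
```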

`url_seeder_demo.py` (262 lines, 9.6 KiB, Python):

"""
|
|
URL Seeder Demo - Interactive showcase of Crawl4AI's URL discovery capabilities
|
|
|
|
This demo shows:
|
|
1. Basic URL discovery from sitemaps and Common Crawl
|
|
2. Cache management and forced refresh
|
|
3. Live URL validation and metadata extraction
|
|
4. BM25 relevance scoring for intelligent filtering
|
|
5. Integration with AsyncWebCrawler for the complete pipeline
|
|
"""

import asyncio
import time
from datetime import datetime

from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, BarColumn, TimeElapsedColumn
from rich.prompt import Prompt, Confirm

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    AsyncUrlSeeder,
    SeedingConfig
)

console = Console()

console.rule("[bold green]🌐 Crawl4AI URL Seeder: Interactive Demo")

DOMAIN = "crawl4ai.com"

# Utils

def print_head_info(head_data):
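    """Render selected <head> fields as a Rich table.

    Expects the head_data dict produced when extract_head=True: optional
    "title" and "charset" strings plus nested "meta" and "link" mappings.
    """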
    table = Table(title="<head> Metadata", expand=True)
    table.add_column("Key", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")

    if not head_data:
        console.print("[yellow]No head data found.")
        return

    if head_data.get("title"):
        table.add_row("title", head_data["title"])
    if head_data.get("charset"):
        table.add_row("charset", head_data["charset"])
    for k, v in head_data.get("meta", {}).items():
        table.add_row(f"meta:{k}", str(v))  # coerce non-string values for Rich
    for rel, items in head_data.get("link", {}).items():
        for item in items:
            table.add_row(f"link:{rel}", item.get("href", ""))
    console.print(table)


async def section_1_basic_exploration(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]1. Basic Seeding")
    cfg = SeedingConfig(source="cc+sitemap", pattern="*", verbose=True)

    start_time = time.time()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description="Fetching from Common Crawl + Sitemap...", total=None)
        urls = await seed.urls(DOMAIN, cfg)
    elapsed = time.time() - start_time

    console.print(f"[green]✓ Fetched {len(urls)} URLs in {elapsed:.2f} seconds")
    console.print(f"[dim]  Speed: {len(urls)/elapsed:.0f} URLs/second[/dim]\n")

    console.print("[bold]Sample URLs:[/bold]")
    for u in urls[:5]:
        console.print(f"  • {u['url']}")


async def section_2_cache_demo(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]2. Caching Demonstration")
    console.print("[yellow]Using `force=True` to bypass cache and fetch fresh data.[/yellow]")
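    # Seeder results are cached on disk (in ~/.crawl4ai/seeder_cache/, per the
    # commit notes); force=True skips that cache and re-fetches from the source.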
    cfg = SeedingConfig(source="cc", pattern="*crawl4ai.com/core/*", verbose=False, force=True)
    await seed.urls(DOMAIN, cfg)


async def section_3_live_head(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]3. Live Check + Head Extraction")
    cfg = SeedingConfig(
        extract_head=True,
        concurrency=10,
        hits_per_sec=5,
        pattern="*crawl4ai.com/*",
        max_urls=10,
        verbose=False,
    )
    urls = await seed.urls(DOMAIN, cfg)
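    # Each record carries a status field ("valid" when the live check succeeds)
    # and, because extract_head=True, the parsed <head> metadata in head_data.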
    valid = [u for u in urls if u["status"] == "valid"]
    console.print(f"[green]Valid: {len(valid)} / {len(urls)}")
    if valid:
        print_head_info(valid[0]["head_data"])


async def section_4_bm25_scoring(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]4. BM25 Relevance Scoring")
    console.print("[yellow]Using BM25 relevance scoring to find the most relevant content[/yellow]")

    query = "markdown generation extraction strategies"
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query=query,
        scoring_method="bm25",
        score_threshold=0.3,  # keep only URLs with a BM25 score above 0.3
        max_urls=20,
        verbose=False
    )
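    # BM25 ranks each URL by matching the query against its extracted <head>
    # text; per the commit notes this needs the optional rank_bm25 package.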

    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description=f"Searching for: '{query}'", total=None)
        urls = await seed.urls(DOMAIN, cfg)

    console.print(f"[green]Found {len(urls)} relevant URLs (score > 0.3)")

    # Show top results with scores
    table = Table(title="Top 5 Most Relevant Pages", expand=True)
    table.add_column("Score", style="cyan", width=8)
    table.add_column("Title", style="magenta")
    table.add_column("URL", style="blue", overflow="fold")

    for url in urls[:5]:
        score = f"{url['relevance_score']:.2f}"
        title = url['head_data'].get('title') or 'No title'
        title = title[:60] + "..." if len(title) > 60 else title
        table.add_row(score, title, url['url'])

    console.print(table)


async def section_5_keyword_filter_to_agent(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]5. Complete Pipeline: Discover → Filter → Crawl")
    cfg = SeedingConfig(
        extract_head=True,
        concurrency=20,
        hits_per_sec=10,
        max_urls=10,
        pattern="*crawl4ai.com/*",
        force=True,
    )
    urls = await seed.urls(DOMAIN, cfg)

    # Cheap keyword filter over the extracted <head> metadata
    keywords = ["deep crawling", "markdown", "llm"]
    selected = [u for u in urls if any(k in str(u["head_data"]).lower() for k in keywords)]
    console.print(f"[cyan]Selected {len(selected)} URLs with relevant keywords:")
    for u in selected[:10]:
        console.print("•", u["url"])

    console.print("\n[yellow]Passing the URLs above to arun_many() for crawling...")
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_run_config = CrawlerRunConfig(
            # Example crawl settings for these URLs:
            only_text=True,           # just get text content
            screenshot=False,
            pdf=False,
            word_count_threshold=50,  # only process pages with at least 50 words
            stream=True,
            verbose=False             # keep logs clean for arun_many in this demo
        )
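        # With stream=True, arun_many() yields results as they complete rather
        # than returning one big list at the end, so large batches can be
        # consumed incrementally (as below).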

        # Extract just the URLs from the selected results
        urls_to_crawl = [u["url"] for u in selected]

        # We'll stream results for large lists, but collect them here for demonstration
        crawled_results_stream = await crawler.arun_many(urls_to_crawl, config=crawl_run_config)
        final_crawled_data = []
        async for result in crawled_results_stream:
            final_crawled_data.append(result)
            if len(final_crawled_data) % 5 == 0:
                print(f"  Processed {len(final_crawled_data)}/{len(urls_to_crawl)} URLs...")

        print(f"\n  Successfully crawled {len(final_crawled_data)} URLs.")
        if final_crawled_data:
            print("\n  Example of a crawled result's URL and Markdown (first successful one):")
            for result in final_crawled_data:
                if result.success and result.markdown.raw_markdown:
                    print(f"  URL: {result.url}")
                    print(f"  Markdown snippet: {result.markdown.raw_markdown[:200]}...")
                    break
            else:
                print("  No successful crawls with markdown found.")
        else:
            print("  No successful crawls found.")


async def section_6_multi_domain(seed: AsyncUrlSeeder):
    console.rule("[bold cyan]6. Multi-Domain Discovery")
    console.print("[yellow]Discovering Python tutorials across multiple educational sites[/yellow]\n")

    domains = ["docs.python.org", "realpython.com", "docs.crawl4ai.com"]
    cfg = SeedingConfig(
        source="sitemap",
        extract_head=True,
        query="python tutorial guide",
        scoring_method="bm25",
        score_threshold=0.2,
        max_urls=5  # per domain
    )
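    # many_urls() runs the same discovery concurrently across the domains and
    # returns a dict mapping each domain to its list of scored URL records.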

    start_time = time.time()
    with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
        p.add_task(description="Discovering across domains...", total=None)
        results = await seed.many_urls(domains, cfg)
    elapsed = time.time() - start_time

    total_urls = sum(len(urls) for urls in results.values())
    console.print(f"[green]✓ Found {total_urls} relevant URLs across {len(domains)} domains in {elapsed:.2f}s\n")

    # Show results per domain
    for domain, urls in results.items():
        console.print(f"[bold]{domain}:[/bold] {len(urls)} relevant pages")
        if urls:
            top = urls[0]
            console.print(f"  Top result: [{top['relevance_score']:.2f}] {top['head_data'].get('title', 'No title')}")


async def main():
    seed = AsyncUrlSeeder()

    # Interactive menu
    sections = {
        "1": ("Basic URL Discovery", section_1_basic_exploration),
        "2": ("Cache Management Demo", section_2_cache_demo),
        "3": ("Live Check & Metadata Extraction", section_3_live_head),
        "4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
        "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
        "6": ("Multi-Domain Discovery", section_6_multi_domain),
        "7": ("Run All Demos", None)
    }
    console.print("\n[bold]Available Demos:[/bold]")
    for key, (title, _) in sections.items():
        console.print(f"  {key}. {title}")

    choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]",
                        choices=list(sections.keys()),
                        default="7")

    console.print()

    if choice == "7":
        # Run all demos
        for key, (title, func) in sections.items():
            if key != "7" and func:
                await func(seed)
                if key != "6":  # don't pause after the last demo
                    if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
                        break
                console.print()
    else:
        # Run selected demo
        _, func = sections[choice]
        await func(seed)

    console.rule("[bold green]Demo Complete ✔︎")


if __name__ == "__main__":
    asyncio.run(main())