docs: Update URL seeding examples to use proper async context managers

- Wrap all AsyncUrlSeeder usage with async context managers
- Update URL seeding adventure example to use "sitemap+cc" source, focus on course posts, and add stream=True parameter to fix runtime error
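
The core pattern change, taken in isolation, looks roughly like the sketch below. This is a minimal illustration rather than a snippet from the updated docs: the domain and the specific `SeedingConfig` values are placeholders, and the cleanup rationale in the comment is an assumption about why the context-manager form is preferred.

```python
import asyncio

from crawl4ai import AsyncUrlSeeder, SeedingConfig


async def main():
    config = SeedingConfig(
        source="sitemap+cc",   # sitemap + Common Crawl, as in the updated example
        extract_head=True,
        max_urls=100,
    )

    # New style: the async context manager is assumed to release the seeder's
    # underlying resources (HTTP sessions, caches) even if an exception is
    # raised, instead of relying on the caller to clean up a bare instance.
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("example.com", config)

    print(f"Discovered {len(urls)} URLs")


asyncio.run(main())
```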
commit ecbe5ffb84
parent 11b310edef
Author: Soham Kukreti
Date:   2025-08-13 18:16:46 +05:30

@@ -102,16 +102,16 @@ async def smart_blog_crawler():
     # Step 2: Configure discovery - let's find all blog posts
     config = SeedingConfig(
-        source="sitemap",            # Use the website's sitemap
-        pattern="*/blog/*.html",     # Only blog posts
+        source="sitemap+cc",         # Use the website's sitemap+cc
+        pattern="*/courses/*",       # Only courses related posts
         extract_head=True,           # Get page metadata
         max_urls=100                 # Limit for this example
     )

     # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
     urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")

     # Step 4: Filter for Python tutorials (using metadata!)
     tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
             only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True
         )

         # Extract URLs and crawl them
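
A likely reason `stream=True` fixes the runtime error: with streaming enabled, `arun_many` yields results as an async generator, which is what an `async for` loop expects, whereas without it the call returns a plain list. The consuming side would look roughly like this sketch (the URL list and the fields printed are placeholders, and the "async generator vs. list" behaviour is my reading of the crawl4ai API rather than something stated in this commit):

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig


async def crawl_streaming(urls):
    config = CrawlerRunConfig(
        only_text=True,
        word_count_threshold=300,  # Only substantial articles
        stream=True,               # yield results as each crawl finishes
    )
    async with AsyncWebCrawler() as crawler:
        # With stream=True, arun_many is consumed incrementally with `async for`.
        async for result in await crawler.arun_many(urls, config=config):
            if result.success:
                print("crawled:", result.url)


asyncio.run(crawl_streaming(["https://realpython.com/courses/example/"]))
```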
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
 **What just happened?**
-1. We discovered all blog URLs from the sitemap
+1. We discovered all blog URLs from the sitemap+cc
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth
@@ -282,8 +283,8 @@ config = SeedingConfig(
     live_check=True,      # Verify each URL is accessible
     concurrency=20        # Check 20 URLs in parallel
 )

-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now you can filter by status
 live_urls = [u for u in urls if u["status"] == "valid"]
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
 config = SeedingConfig(
     extract_head=True  # Extract metadata from <head> section
 )

-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)

 # Now each URL has rich metadata
 for url in urls[:3]:
@@ -387,8 +388,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.3 score_threshold=0.3
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# URLs are scored based on: # URLs are scored based on:
# 1. Domain parts matching (e.g., 'python' in python.example.com) # 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -429,8 +430,8 @@ config = SeedingConfig(
     extract_head=True,
     live_check=True
 )

-urls = await seeder.urls("blog.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("blog.example.com", config)

 # Analyze the results
 for url in urls[:5]:
@@ -488,8 +489,8 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score score_threshold=0.3 # Minimum relevance score
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config) urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance! # Results are automatically sorted by relevance!
for url in urls[:5]: for url in urls[:5]:
@@ -511,8 +512,8 @@ config = SeedingConfig(
     score_threshold=0.5,
     max_urls=20
 )

-urls = await seeder.urls("docs.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("docs.example.com", config)

 # The highest scoring URLs will be API docs!
 ```
@@ -529,8 +530,8 @@ config = SeedingConfig(
     score_threshold=0.4,
     pattern="*/product/*"  # Combine with pattern matching
 )

-urls = await seeder.urls("shop.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("shop.example.com", config)

 # Filter further by price (from metadata)
 affordable = [
@@ -550,8 +551,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.35 score_threshold=0.35
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config) urls = await seeder.urls("technews.com", config)
# Filter by date # Filter by date
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -591,8 +592,8 @@ for query in queries:
         score_threshold=0.4,
         max_urls=10  # Top 10 per topic
     )

-    urls = await seeder.urls("learning-platform.com", config)
+    async with AsyncUrlSeeder() as seeder:
+        urls = await seeder.urls("learning-platform.com", config)

     all_tutorials.extend(urls)

 # Remove duplicates while preserving order
@@ -625,7 +626,8 @@ config = SeedingConfig(
 )

 # Returns a dictionary: {domain: [urls]}
-results = await seeder.many_urls(domains, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(domains, config)

 # Process results
 for domain, urls in results.items():
@@ -654,8 +656,8 @@ config = SeedingConfig(
pattern="*/blog/*", pattern="*/blog/*",
max_urls=100 max_urls=100
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config) results = await seeder.many_urls(competitors, config)
# Analyze content types # Analyze content types
for domain, urls in results.items(): for domain, urls in results.items():
@@ -690,8 +692,8 @@ config = SeedingConfig(
     score_threshold=0.3,
     max_urls=20  # Per site
 )

-results = await seeder.many_urls(educational_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(educational_sites, config)

 # Find the best beginner tutorials
 all_tutorials = []
@@ -731,8 +733,8 @@ config = SeedingConfig(
     score_threshold=0.5,  # High threshold for relevance
     max_urls=10
 )

-results = await seeder.many_urls(news_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(news_sites, config)

 # Collect all mentions
 mentions = []
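
The multi-domain hunks above all follow the same shape: build one `SeedingConfig`, open one `AsyncUrlSeeder`, and fan out with `many_urls`. A condensed, self-contained sketch of that shape follows; note the assumptions it makes beyond what the diff shows, namely that the BM25 query is passed via a `query` parameter and that each result dict exposes its score under `"relevance_score"` (the site list and topic are placeholders).

```python
import asyncio

from crawl4ai import AsyncUrlSeeder, SeedingConfig


async def collect_mentions(sites, topic):
    config = SeedingConfig(
        source="sitemap+cc",
        extract_head=True,
        query=topic,              # assumed parameter name for the BM25 query
        scoring_method="bm25",
        score_threshold=0.5,      # high threshold for relevance
        max_urls=10,
    )

    async with AsyncUrlSeeder() as seeder:
        # many_urls returns a dictionary: {domain: [url_info, ...]}
        results = await seeder.many_urls(sites, config)

    mentions = []
    for domain, urls in results.items():
        for u in urls:
            # "relevance_score" is assumed to be the score field on each entry
            mentions.append((domain, u["url"], u.get("relevance_score", 0.0)))

    mentions.sort(key=lambda m: m[2], reverse=True)  # highest-scoring first
    return mentions


if __name__ == "__main__":
    top = asyncio.run(
        collect_mentions(["technews.example.com", "press.example.org"], "open source AI")
    )
    for domain, url, score in top[:5]:
        print(f"{score:.2f}  {domain}  {url}")
```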