docs: Update URL seeding examples to use proper async context managers

- Wrap all AsyncUrlSeeder usage with async context managers
- Update URL seeding adventure example to use "sitemap+cc" source, focus on course posts, and add stream=True parameter to fix runtime error
This commit is contained in:
Soham Kukreti
2025-08-13 18:16:46 +05:30
parent 11b310edef
commit ecbe5ffb84

View File

@@ -102,16 +102,16 @@ async def smart_blog_crawler():
# Step 2: Configure discovery - let's find all blog posts
config = SeedingConfig(
source="sitemap", # Use the website's sitemap
pattern="*/blog/*.html", # Only blog posts
source="sitemap+cc", # Use the website's sitemap+cc
pattern="*/courses/*", # Only courses related posts
extract_head=True, # Get page metadata
max_urls=100 # Limit for this example
)
# Step 3: Discover URLs from the Python blog
print("🔍 Discovering blog posts...")
print("🔍 Discovering course posts...")
urls = await seeder.urls("realpython.com", config)
print(f"✅ Found {len(urls)} blog posts")
print(f"✅ Found {len(urls)} course posts")
# Step 4: Filter for Python tutorials (using metadata!)
tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
only_text=True,
word_count_threshold=300 # Only substantial articles
word_count_threshold=300, # Only substantial articles
stream=True
)
# Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
**What just happened?**
1. We discovered all blog URLs from the sitemap
1. We discovered all blog URLs from the sitemap+cc
2. We filtered using metadata (no crawling needed!)
3. We crawled only the relevant tutorials
4. We saved tons of time and bandwidth
@@ -282,7 +283,7 @@ config = SeedingConfig(
live_check=True, # Verify each URL is accessible
concurrency=20 # Check 20 URLs in parallel
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# Now you can filter by status
@@ -311,7 +312,7 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
config = SeedingConfig(
extract_head=True # Extract metadata from <head> section
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# Now each URL has rich metadata
@@ -387,7 +388,7 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.3
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# URLs are scored based on:
@@ -429,7 +430,7 @@ config = SeedingConfig(
extract_head=True,
live_check=True
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("blog.example.com", config)
# Analyze the results
@@ -488,7 +489,7 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance!
@@ -511,7 +512,7 @@ config = SeedingConfig(
score_threshold=0.5,
max_urls=20
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("docs.example.com", config)
# The highest scoring URLs will be API docs!
@@ -529,7 +530,7 @@ config = SeedingConfig(
score_threshold=0.4,
pattern="*/product/*" # Combine with pattern matching
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("shop.example.com", config)
# Filter further by price (from metadata)
@@ -550,7 +551,7 @@ config = SeedingConfig(
scoring_method="bm25",
score_threshold=0.35
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config)
# Filter by date
@@ -591,7 +592,7 @@ for query in queries:
score_threshold=0.4,
max_urls=10 # Top 10 per topic
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("learning-platform.com", config)
all_tutorials.extend(urls)
@@ -625,6 +626,7 @@ config = SeedingConfig(
)
# Returns a dictionary: {domain: [urls]}
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(domains, config)
# Process results
@@ -654,7 +656,7 @@ config = SeedingConfig(
pattern="*/blog/*",
max_urls=100
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config)
# Analyze content types
@@ -690,7 +692,7 @@ config = SeedingConfig(
score_threshold=0.3,
max_urls=20 # Per site
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(educational_sites, config)
# Find the best beginner tutorials
@@ -731,7 +733,7 @@ config = SeedingConfig(
score_threshold=0.5, # High threshold for relevance
max_urls=10
)
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(news_sites, config)
# Collect all mentions