docs: Update URL seeding examples to use proper async context managers

- Wrap all AsyncUrlSeeder usage with async context managers
- Update URL seeding adventure example to use "sitemap+cc" source, focus on course posts, and add stream=True parameter to fix runtime error
2025-08-13 18:16:46 +05:30
parent 11b310edef
commit ecbe5ffb84
1 changed files with 33 additions and 31 deletions
--- a/docs/md_v2/core/url-seeding.md
+++ b/docs/md_v2/core/url-seeding.md
@@ -102,16 +102,16 @@ async def smart_blog_crawler():
    
    # Step 2: Configure discovery - let's find all blog posts
    config = SeedingConfig(
-        source="sitemap",           # Use the website's sitemap
-        pattern="*/blog/*.html",    # Only blog posts
+        source="sitemap+cc",      # Combine the site's sitemap with Common Crawl
+        pattern="*/courses/*",    # Only course-related posts
        extract_head=True,          # Get page metadata
        max_urls=100               # Limit for this example
    )
    
    # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
    urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")
    
    # Step 4: Filter for Python tutorials (using metadata!)
    tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True
        )
        
        # Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())

 **What just happened?**

-1. We discovered all blog URLs from the sitemap
+1. We discovered all course URLs from the sitemap and Common Crawl
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth
@@ -282,7 +283,7 @@ config = SeedingConfig(
    live_check=True,  # Verify each URL is accessible
    concurrency=20    # Check 20 URLs in parallel
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("example.com", config)

 # Now you can filter by status
@@ -311,7 +312,7 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
 config = SeedingConfig(
    extract_head=True  # Extract metadata from <head> section
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("example.com", config)

 # Now each URL has rich metadata
@@ -387,7 +388,7 @@ config = SeedingConfig(
    scoring_method="bm25",
    score_threshold=0.3
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("example.com", config)

 # URLs are scored based on:
@@ -429,7 +430,7 @@ config = SeedingConfig(
    extract_head=True,
    live_check=True
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("blog.example.com", config)

 # Analyze the results
@@ -488,7 +489,7 @@ config = SeedingConfig(
    scoring_method="bm25",       # Use BM25 algorithm
    score_threshold=0.3          # Minimum relevance score
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("realpython.com", config)

 # Results are automatically sorted by relevance!
@@ -511,7 +512,7 @@ config = SeedingConfig(
    score_threshold=0.5,
    max_urls=20
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("docs.example.com", config)

 # The highest scoring URLs will be API docs!
@@ -529,7 +530,7 @@ config = SeedingConfig(
    score_threshold=0.4,
    pattern="*/product/*"  # Combine with pattern matching
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("shop.example.com", config)

 # Filter further by price (from metadata)
@@ -550,7 +551,7 @@ config = SeedingConfig(
    scoring_method="bm25",
    score_threshold=0.35
 )
-
+async with AsyncUrlSeeder() as seeder:
    urls = await seeder.urls("technews.com", config)

 # Filter by date
@@ -591,7 +592,7 @@ for query in queries:
        score_threshold=0.4,
        max_urls=10  # Top 10 per topic
    )
-    
+    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("learning-platform.com", config)
    all_tutorials.extend(urls)

@@ -625,6 +626,7 @@ config = SeedingConfig(
 )

 # Returns a dictionary: {domain: [urls]}
+async with AsyncUrlSeeder() as seeder:
    results = await seeder.many_urls(domains, config)

 # Process results
@@ -654,7 +656,7 @@ config = SeedingConfig(
    pattern="*/blog/*",
    max_urls=100
 )
-
+async with AsyncUrlSeeder() as seeder:
    results = await seeder.many_urls(competitors, config)

 # Analyze content types
@@ -690,7 +692,7 @@ config = SeedingConfig(
    score_threshold=0.3,
    max_urls=20  # Per site
 )
-
+async with AsyncUrlSeeder() as seeder:
    results = await seeder.many_urls(educational_sites, config)

 # Find the best beginner tutorials
@@ -731,7 +733,7 @@ config = SeedingConfig(
    score_threshold=0.5,  # High threshold for relevance
    max_urls=10
 )
-
+async with AsyncUrlSeeder() as seeder:
    results = await seeder.many_urls(news_sites, config)

 # Collect all mentions