From 1e1c887a2f59dc2fcef1bd139ddee990ddc28ddd Mon Sep 17 00:00:00 2001
From: Emmanuel Ferdman
Date: Tue, 13 May 2025 00:04:58 -0700
Subject: [PATCH 1/3] fix(docker-api): migrate to modern datetime library API

Signed-off-by: Emmanuel Ferdman
---
 deploy/docker/api.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 732371f7..edfa51e5 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -4,7 +4,7 @@ import asyncio
 from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
-from datetime import datetime
+from datetime import datetime, timezone
 import logging
 from typing import Optional, AsyncGenerator
 
@@ -542,7 +542,7 @@ async def handle_crawl_job(
     task_id = f"crawl_{uuid4().hex[:8]}"
     await redis.hset(f"task:{task_id}", mapping={
         "status": TaskStatus.PROCESSING,  # <-- keep enum values consistent
-        "created_at": datetime.utcnow().isoformat(),
+        "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
         "url": json.dumps(urls),  # store list as JSON string
         "result": "",
         "error": "",

From 7a8190ecb67020743e1b1fb41c8b197e4507ed59 Mon Sep 17 00:00:00 2001
From: Nezar Ali
Date: Wed, 6 Aug 2025 11:58:29 +0300
Subject: [PATCH 2/3] Fix examples in README.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f70eb264..274e8075 100644
--- a/README.md
+++ b/README.md
@@ -347,7 +347,7 @@ async def main():
 
     async with AsyncWebCrawler(config=browser_config) as crawler:
         result = await crawler.arun(
-            url="https://docs.micronaut.io/4.7.6/guide/",
+            url="https://docs.micronaut.io/4.9.9/guide/",
             config=run_config
         )
         print(len(result.markdown.raw_markdown))
@@ -399,7 +399,7 @@ async def main():
                 "type": "attribute",
                 "attribute": "src"
             }
-        }
+        ]
     }
 
     extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

From ecbe5ffb84a769492863b6a602f013dfaa920ce7 Mon Sep 17 00:00:00 2001
From: Soham Kukreti
Date: Wed, 13 Aug 2025 18:16:46 +0530
Subject: [PATCH 3/3] docs: Update URL seeding examples to use proper async
 context managers

- Wrap all AsyncUrlSeeder usage with async context managers
- Update the URL seeding adventure example to use the "sitemap+cc" source,
  focus on course posts, and add the stream=True parameter to fix a runtime
  error
---
 docs/md_v2/core/url-seeding.md | 64 ++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md
index f891c204..106a80a0 100644
--- a/docs/md_v2/core/url-seeding.md
+++ b/docs/md_v2/core/url-seeding.md
@@ -102,16 +102,16 @@ async def smart_blog_crawler():
 
     # Step 2: Configure discovery - let's find all blog posts
     config = SeedingConfig(
-        source="sitemap",          # Use the website's sitemap
-        pattern="*/blog/*.html",   # Only blog posts
+        source="sitemap+cc",       # Use the sitemap plus Common Crawl
+        pattern="*/courses/*",     # Only course-related posts
         extract_head=True,         # Get page metadata
         max_urls=100               # Limit for this example
     )
 
     # Step 3: Discover URLs from the Python blog
-    print("🔍 Discovering blog posts...")
+    print("🔍 Discovering course posts...")
     urls = await seeder.urls("realpython.com", config)
-    print(f"✅ Found {len(urls)} blog posts")
+    print(f"✅ Found {len(urls)} course posts")
 
     # Step 4: Filter for Python tutorials (using metadata!)
     tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
     async with AsyncWebCrawler() as crawler:
         config = CrawlerRunConfig(
             only_text=True,
-            word_count_threshold=300  # Only substantial articles
+            word_count_threshold=300,  # Only substantial articles
+            stream=True
         )
 
         # Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
 
 **What just happened?**
 
-1. We discovered all blog URLs from the sitemap
+1. We discovered all course URLs from the sitemap and Common Crawl
 2. We filtered using metadata (no crawling needed!)
 3. We crawled only the relevant tutorials
 4. We saved tons of time and bandwidth
@@ -282,8 +283,8 @@ config = SeedingConfig(
     live_check=True,    # Verify each URL is accessible
     concurrency=20      # Check 20 URLs in parallel
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)
 
 # Now you can filter by status
 live_urls = [u for u in urls if u["status"] == "valid"]
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
 config = SeedingConfig(
     extract_head=True  # Extract metadata from <head> section
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)
 
 # Now each URL has rich metadata
 for url in urls[:3]:
@@ -387,8 +388,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.3
 )
-
-urls = await seeder.urls("example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("example.com", config)
 
 # URLs are scored based on:
 # 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -429,8 +430,8 @@ config = SeedingConfig(
     extract_head=True,
     live_check=True
 )
-
-urls = await seeder.urls("blog.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("blog.example.com", config)
 
 # Analyze the results
 for url in urls[:5]:
@@ -488,8 +489,8 @@ config = SeedingConfig(
     scoring_method="bm25",   # Use BM25 algorithm
     score_threshold=0.3      # Minimum relevance score
 )
-
-urls = await seeder.urls("realpython.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("realpython.com", config)
 
 # Results are automatically sorted by relevance!
 for url in urls[:5]:
@@ -511,8 +512,8 @@ config = SeedingConfig(
     score_threshold=0.5,
     max_urls=20
 )
-
-urls = await seeder.urls("docs.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("docs.example.com", config)
 
 # The highest scoring URLs will be API docs!
 ```
@@ -529,8 +530,8 @@ config = SeedingConfig(
     score_threshold=0.4,
     pattern="*/product/*"  # Combine with pattern matching
 )
-
-urls = await seeder.urls("shop.example.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("shop.example.com", config)
 
 # Filter further by price (from metadata)
 affordable = [
@@ -550,8 +551,8 @@ config = SeedingConfig(
     scoring_method="bm25",
     score_threshold=0.35
 )
-
-urls = await seeder.urls("technews.com", config)
+async with AsyncUrlSeeder() as seeder:
+    urls = await seeder.urls("technews.com", config)
 
 # Filter by date
 from datetime import datetime, timedelta
@@ -591,8 +592,8 @@ for query in queries:
         score_threshold=0.4,
         max_urls=10  # Top 10 per topic
     )
-
-    urls = await seeder.urls("learning-platform.com", config)
+    async with AsyncUrlSeeder() as seeder:
+        urls = await seeder.urls("learning-platform.com", config)
     all_tutorials.extend(urls)
 
 # Remove duplicates while preserving order
@@ -625,7 +626,8 @@ config = SeedingConfig(
 )
 
 # Returns a dictionary: {domain: [urls]}
-results = await seeder.many_urls(domains, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(domains, config)
 
 # Process results
 for domain, urls in results.items():
@@ -654,8 +656,8 @@ config = SeedingConfig(
     pattern="*/blog/*",
     max_urls=100
 )
-
-results = await seeder.many_urls(competitors, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(competitors, config)
 
 # Analyze content types
 for domain, urls in results.items():
@@ -690,8 +692,8 @@ config = SeedingConfig(
     score_threshold=0.3,
     max_urls=20  # Per site
 )
-
-results = await seeder.many_urls(educational_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(educational_sites, config)
 
 # Find the best beginner tutorials
 all_tutorials = []
@@ -731,8 +733,8 @@ config = SeedingConfig(
     score_threshold=0.5,  # High threshold for relevance
     max_urls=10
 )
-
-results = await seeder.many_urls(news_sites, config)
+async with AsyncUrlSeeder() as seeder:
+    results = await seeder.many_urls(news_sites, config)
 
 # Collect all mentions
 mentions = []
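
For reviewers who want to see the documented pattern end to end, here is a minimal sketch of what the third patch describes: `AsyncUrlSeeder` used inside an async context manager, followed by a streaming crawl with `stream=True`. The domain, pattern, and limits below are illustrative placeholders, not values taken from the patches.

```python
import asyncio

from crawl4ai import AsyncUrlSeeder, SeedingConfig, AsyncWebCrawler, CrawlerRunConfig


async def main():
    # Discovery settings mirroring the updated docs; "example.com" and the
    # pattern are placeholders chosen for illustration.
    seeding_config = SeedingConfig(
        source="sitemap+cc",    # sitemap plus Common Crawl
        pattern="*/courses/*",  # keep only course pages
        extract_head=True,      # fetch <head> metadata for later filtering
        max_urls=50,
    )

    # The seeder is opened and closed by the async context manager.
    async with AsyncUrlSeeder() as seeder:
        discovered = await seeder.urls("example.com", seeding_config)

    # Each entry is a dict; pull out the plain URL strings to crawl.
    targets = [item["url"] for item in discovered]

    run_config = CrawlerRunConfig(only_text=True, stream=True)
    async with AsyncWebCrawler() as crawler:
        # stream=True makes arun_many yield results as they finish,
        # so they can be consumed with `async for`.
        async for result in await crawler.arun_many(targets, config=run_config):
            if result.success:
                print(result.url, len(result.markdown.raw_markdown))


asyncio.run(main())
```

Closing the seeder before the crawl begins keeps the discovery and crawling phases separate, which is the design the patched documentation settles on.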