diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..e8f289d7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,3 @@ +{ + "enableAllProjectMcpServers": false +} \ No newline at end of file diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 2c6c2f29..c0d2b424 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,8 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig +# MODIFIED: Add SeedingConfig here +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -65,6 +66,8 @@ from .deep_crawling import ( DFSDeepCrawlStrategy, DeepCrawlDecorator, ) +# NEW: Import AsyncUrlSeeder +from .async_url_seeder import AsyncUrlSeeder from .utils import ( start_colab_display_server, @@ -78,6 +81,10 @@ __all__ = [ "BrowserProfiler", "LLMConfig", "GeolocationConfig", + # NEW: Add SeedingConfig + "SeedingConfig", + # NEW: Add AsyncUrlSeeder + "AsyncUrlSeeder", "DeepCrawlStrategy", "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", @@ -160,4 +167,4 @@ __all__ = [ # Disable all Pydantic warnings warnings.filterwarnings("ignore", module="pydantic") -# pydantic_warnings.filter_warnings() +# pydantic_warnings.filter_warnings() \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 3fcd9911..4eb116c6 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -207,7 +207,6 @@ class GeolocationConfig: config_dict.update(kwargs) return GeolocationConfig.from_dict(config_dict) - class ProxyConfig: def __init__( self, @@ -318,8 +317,6 @@ class ProxyConfig: config_dict.update(kwargs) return ProxyConfig.from_dict(config_dict) - - class BrowserConfig: """ 
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -597,7 +594,6 @@ class BrowserConfig: return config return BrowserConfig.from_kwargs(config) - class HTTPCrawlerConfig: """HTTP-specific crawler configuration""" @@ -1329,7 +1325,6 @@ class CrawlerRunConfig(): config_dict.update(kwargs) return CrawlerRunConfig.from_kwargs(config_dict) - class LLMConfig: def __init__( self, @@ -1414,4 +1409,51 @@ class LLMConfig: config_dict.update(kwargs) return LLMConfig.from_kwargs(config_dict) +class SeedingConfig: + """ + Configuration class for URL discovery and pre-validation via AsyncUrlSeeder. + """ + def __init__( + self, + source: str = "sitemap+cc", # Options: "sitemap", "cc", "sitemap+cc" + pattern: Optional[str] = "*", # URL pattern to filter discovered URLs (e.g., "*example.com/blog/*") + live_check: bool = False, # Whether to perform HEAD requests to verify URL liveness + extract_head: bool = False, # Whether to fetch and parse section for metadata + max_urls: int = -1, # Maximum number of URLs to discover (default: -1 for no limit) + concurrency: int = 1000, # Maximum concurrent requests for live checks/head extraction + hits_per_sec: int = 5, # Rate limit in requests per second + force: bool = False, # If True, bypasses the AsyncUrlSeeder's internal .jsonl cache + base_directory: Optional[str] = None, # Base directory for UrlSeeder's cache files (.jsonl) + llm_config: Optional[LLMConfig] = None, # Forward LLM config for future use (e.g., relevance scoring) + verbose: Optional[bool] = None, # Override crawler's general verbose setting + query: Optional[str] = None, # Search query for relevance scoring + score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0) + scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic" + ): + self.source = source + self.pattern = pattern + self.live_check = live_check + self.extract_head = extract_head + 
self.max_urls = max_urls + self.concurrency = concurrency + self.hits_per_sec = hits_per_sec + self.force = force + self.base_directory = base_directory + self.llm_config = llm_config + self.verbose = verbose + self.query = query + self.score_threshold = score_threshold + self.scoring_method = scoring_method + # Add to_dict, from_kwargs, and clone methods for consistency + def to_dict(self) -> Dict[str, Any]: + return {k: v for k, v in self.__dict__.items() if k != 'llm_config' or v is not None} + + @staticmethod + def from_kwargs(kwargs: Dict[str, Any]) -> 'SeedingConfig': + return SeedingConfig(**kwargs) + + def clone(self, **kwargs: Any) -> 'SeedingConfig': + config_dict = self.to_dict() + config_dict.update(kwargs) + return SeedingConfig.from_kwargs(config_dict) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 49c7ee6f..e203b6c9 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -29,7 +29,7 @@ class LogLevel(Enum): class LogColor(str, Enum): """Enum for log colors.""" - DEBUG = "lightblack" + DEBUG = "bright_black" INFO = "cyan" SUCCESS = "green" WARNING = "yellow" diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py new file mode 100644 index 00000000..b9dce91a --- /dev/null +++ b/crawl4ai/async_url_seeder.py @@ -0,0 +1,944 @@ +""" +async_url_seeder.py +Fast async URL discovery for Crawl4AI + +Features +-------- +* Common-Crawl streaming via httpx.AsyncClient (HTTP/2, keep-alive) +* robots.txt → sitemap chain (.gz + nested indexes) via async httpx +* Per-domain CDX result cache on disk (~/.crawl4ai/__.jsonl) +* Optional HEAD-only liveness check +* Optional partial download + meta parsing +* Global hits-per-second rate-limit via asyncio.Semaphore +* Concurrency in the thousands — fine on a single event-loop +""" + +from __future__ import annotations +import aiofiles, asyncio, gzip, hashlib, io, json, os, pathlib, re, time +from datetime import timedelta +from pathlib import Path +from typing import 
Any, Dict, Iterable, List, Optional, Sequence, Union +from urllib.parse import quote, urljoin + +import httpx +import fnmatch +try: + from lxml import html as lxml_html + from lxml import etree + LXML = True +except ImportError: + LXML = False +try: + import brotli + HAS_BROTLI = True +except ImportError: + HAS_BROTLI = False +try: + import rank_bm25 + HAS_BM25 = True +except ImportError: + HAS_BM25 = False + +# Import AsyncLoggerBase from crawl4ai's logger module +# Assuming crawl4ai/async_logger.py defines AsyncLoggerBase +# You might need to adjust this import based on your exact file structure +from .async_logger import AsyncLoggerBase, AsyncLogger # Import AsyncLogger for default if needed + +# Import SeedingConfig for type hints +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from .async_configs import SeedingConfig + + +# ────────────────────────────────────────────────────────────────────────── consts +COLLINFO_URL = "https://index.commoncrawl.org/collinfo.json" +# CACHE_DIR = pathlib.Path("~/.crawl4ai").expanduser() # REMOVED: now managed by __init__ +# CACHE_DIR.mkdir(exist_ok=True) # REMOVED: now managed by __init__ +# INDEX_CACHE = CACHE_DIR / "latest_cc_index.txt" # REMOVED: now managed by __init__ +TTL = timedelta(days=7) # Keeping this constant as it's a seeder-specific TTL + +_meta_rx = re.compile( + r']*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>', + re.I) +_charset_rx = re.compile(r']*charset=["\']?([^"\' >]+)', re.I) +_title_rx = re.compile(r'(.*?)', re.I|re.S) +_link_rx = re.compile(r']*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I) + +# ────────────────────────────────────────────────────────────────────────── helpers +def _match(url: str, pattern: str) -> bool: + if fnmatch.fnmatch(url, pattern): + return True + canon = url.split("://", 1)[-1] + return (fnmatch.fnmatch(canon, pattern) + or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern))) + +def 
_parse_head(src: str) -> Dict[str, Any]: + if LXML: + try: + if isinstance(src, str): + src = src.encode("utf-8", "replace") # strip Unicode, let lxml decode + doc = lxml_html.fromstring(src) + except (ValueError, etree.ParserError): + return {} # malformed, bail gracefully + info: Dict[str, Any] = { + "title": (doc.find(".//title").text or "").strip() + if doc.find(".//title") is not None else None, + "charset": None, + "meta": {}, "link": {}, "jsonld": [] + } + for el in doc.xpath(".//meta"): + k = el.attrib.get("name") or el.attrib.get("property") or el.attrib.get("http-equiv") + if k: info["meta"][k.lower()] = el.attrib.get("content", "") + elif "charset" in el.attrib: info["charset"] = el.attrib["charset"].lower() + for el in doc.xpath(".//link"): + rel = " ".join(el.attrib.get("rel", [])).lower() + if not rel: continue + entry = {a: el.attrib[a] for a in ("href","as","type","hreflang") if a in el.attrib} + info["link"].setdefault(rel, []).append(entry) + # Extract JSON-LD structured data + for script in doc.xpath('.//script[@type="application/ld+json"]'): + if script.text: + try: + jsonld_data = json.loads(script.text.strip()) + info["jsonld"].append(jsonld_data) + except json.JSONDecodeError: + pass + # Extract html lang attribute + html_elem = doc.find(".//html") + if html_elem is not None: + info["lang"] = html_elem.attrib.get("lang", "") + return info + # regex fallback + info: Dict[str,Any] = {"title":None,"charset":None,"meta":{},"link":{},"jsonld":[],"lang":""} + m=_title_rx.search(src); info["title"]=m.group(1).strip() if m else None + for k,v in _meta_rx.findall(src): info["meta"][k.lower()]=v + m=_charset_rx.search(src); info["charset"]=m.group(1).lower() if m else None + for rel,href in _link_rx.findall(src): + info["link"].setdefault(rel.lower(),[]).append({"href":href}) + # Try to extract JSON-LD with regex + jsonld_pattern = re.compile(r']*type=["\']application/ld\+json["\'][^>]*>(.*?)', re.I|re.S) + for match in jsonld_pattern.findall(src): + 
try: + jsonld_data = json.loads(match.strip()) + info["jsonld"].append(jsonld_data) + except json.JSONDecodeError: + pass + # Try to extract lang attribute + lang_match = re.search(r']*lang=["\']?([^"\' >]+)', src, re.I) + if lang_match: + info["lang"] = lang_match.group(1) + return info + +# ────────────────────────────────────────────────────────────────────────── class +class AsyncUrlSeeder: + """ + Async version of UrlSeeder. + Call pattern is await/async for / async with. + + Public coroutine + ---------------- + await seed.urls(...) + returns List[Dict[str,Any]] (url, status, head_data) + """ + + def __init__( + self, + ttl: timedelta = TTL, + client: Optional[httpx.AsyncClient]=None, + logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter + base_directory: Optional[Union[str, pathlib.Path]] = None, # NEW: Add base_directory + cache_root: Optional[Union[str, Path]] = None, + ): + self.ttl = ttl + self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={ + "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36" + }) + self.logger = logger # Store the logger instance + self.base_directory = pathlib.Path(base_directory or os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory + self.cache_dir = self.base_directory / ".crawl4ai" / "seeder_cache" # NEW: Specific cache dir for seeder + self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists + self.index_cache_path = self.cache_dir / "latest_cc_index.txt" # NEW: Index cache path + + # defer – grabbing the index inside an active loop blows up + self.index_id: Optional[str] = None + self._rate_sem: Optional[asyncio.Semaphore] = None + + # ───────── cache dirs ───────── + self.cache_root = Path(os.path.expanduser(cache_root or "~/.cache/url_seeder")) + (self.cache_root / "live").mkdir(parents=True, exist_ok=True) + (self.cache_root / "head").mkdir(exist_ok=True) + + def _log(self, 
level: str, message: str, tag: str = "URL_SEED", **kwargs: Any): + """Helper to log messages using the provided logger, if available.""" + if self.logger: + log_method = getattr(self.logger, level, None) + if log_method: + log_method(message=message, tag=tag, params=kwargs.get('params', {})) + # else: # Fallback for unknown level, should not happen with AsyncLoggerBase + # print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}") + + # ───────── cache helpers ───────── + def _cache_path(self, kind: str, url: str) -> Path: + h = hashlib.sha1(url.encode()).hexdigest() + return self.cache_root / kind / f"{h}.json" + + def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]: + p = self._cache_path(kind, url) + if not p.exists(): + return None + # TTL check + if time.time() - p.stat().st_mtime > self.ttl.total_seconds(): + return None + try: + return json.loads(p.read_text()) + except Exception: + return None + + def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None: + try: + self._cache_path(kind, url).write_text( + json.dumps(data, separators=(",", ":")) + ) + except Exception: + pass + + + # ─────────────────────────────── discovery entry + async def urls(self, + domain: str, + config: "SeedingConfig", + ) -> List[Dict[str,Any]]: + """ + Fetch URLs for a domain using configuration from SeedingConfig. 
+ + Parameters + ---------- + domain : str + The domain to fetch URLs for (e.g., "example.com") + config : SeedingConfig + Configuration object containing all seeding parameters + """ + # Extract parameters from config + pattern = config.pattern or "*" + source = config.source + live_check = config.live_check + extract_head = config.extract_head + concurrency = config.concurrency + head_timeout = 5 # Default timeout for HEAD requests + hits_per_sec = config.hits_per_sec + self.force = config.force # Store force flag as instance attribute + force = config.force + verbose = config.verbose if config.verbose is not None else (self.logger.verbose if self.logger else False) + max_urls = config.max_urls if config.max_urls is not None else -1 + query = config.query + score_threshold = config.score_threshold + scoring_method = config.scoring_method + + # Ensure seeder's logger verbose matches the config's verbose if it's set + if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None: + self.logger.verbose = config.verbose + + # ensure we have the latest CC collection id + if self.index_id is None: + self.index_id = await self._latest_index() + + # Parse source parameter - split by '+' to get list of sources + sources = source.split('+') + valid_sources = {"cc", "sitemap"} + for s in sources: + if s not in valid_sources: + raise ValueError(f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}") + + if hits_per_sec: + if hits_per_sec <= 0: + self._log("warning", "hits_per_sec must be positive. 
Disabling rate limiting.", tag="URL_SEED") + self._rate_sem = None + else: + self._rate_sem = asyncio.Semaphore(hits_per_sec) + else: + self._rate_sem = None # Ensure it's None if no rate limiting + + self._log("info", "Starting URL seeding for {domain} with source={source}", + params={"domain": domain, "source": source}, tag="URL_SEED") + + # choose stream + async def gen(): + if "sitemap" in sources: + self._log("debug", "Fetching from sitemaps...", tag="URL_SEED") + async for u in self._from_sitemaps(domain, pattern, force): + yield u + if "cc" in sources: + self._log("debug", "Fetching from Common Crawl...", tag="URL_SEED") + async for u in self._from_cc(domain, pattern, force): + yield u + + queue = asyncio.Queue() + producer_done = asyncio.Event() + stop_event = asyncio.Event() + seen: set[str] = set() + + async def producer(): + try: + async for u in gen(): + if u in seen: + self._log("debug", "Skipping duplicate URL: {url}", + params={"url": u}, tag="URL_SEED") + continue + if stop_event.is_set(): + self._log("info", "Producer stopping due to max_urls limit.", tag="URL_SEED") + break + await queue.put(u) + except Exception as e: + self._log("error", "Producer encountered an error: {error}", params={"error": str(e)}, tag="URL_SEED") + finally: + producer_done.set() + self._log("debug", "Producer finished.", tag="URL_SEED") + + + async def worker(res_list: List[Dict[str,Any]]): + while True: + if queue.empty() and producer_done.is_set(): + # self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED") + break + try: + url = await asyncio.wait_for(queue.get(), 5) # Increased timeout slightly + except asyncio.TimeoutError: + continue # Keep checking queue and producer_done status + except Exception as e: + self._log("error", "Worker failed to get URL from queue: {error}", params={"error": str(e)}, tag="URL_SEED") + continue + + if max_urls > 0 and len(res_list) >= max_urls: + self._log( + "info", + "Worker stopping due to max_urls 
limit.", + tag="URL_SEED", + ) + stop_event.set() + + # mark the current item done + queue.task_done() + + # flush whatever is still sitting in the queue so + # queue.join() can finish cleanly + while not queue.empty(): + try: + queue.get_nowait() + queue.task_done() + except asyncio.QueueEmpty: + break + break + + if self._rate_sem: # global QPS control + async with self._rate_sem: + await self._validate(url, res_list, live_check, extract_head, + head_timeout, verbose) + else: + await self._validate(url, res_list, live_check, extract_head, + head_timeout, verbose) + queue.task_done() # Mark task as done for queue.join() if ever used + + # launch + results: List[Dict[str,Any]] = [] + prod_task = asyncio.create_task(producer()) + workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)] + + # Wait for all workers to finish + await asyncio.gather(prod_task, *workers) + await queue.join() # Ensure all queued items are processed + + self._log("info", "Finished URL seeding for {domain}. 
Total URLs: {count}", + params={"domain": domain, "count": len(results)}, tag="URL_SEED") + + # Apply BM25 scoring if query is provided and extract_head is enabled + if query and extract_head and scoring_method == "bm25": + self._log("info", "Applying BM25 scoring for query: '{query}'", + params={"query": query}, tag="URL_SEED") + + # Extract text contexts from all results + documents = [] + valid_indices = [] + for i, result in enumerate(results): + if result.get("head_data"): + text_context = self._extract_text_context(result["head_data"]) + if text_context: # Only include non-empty contexts + documents.append(text_context) + valid_indices.append(i) + + if documents: + # Calculate BM25 scores + scores = self._calculate_bm25_score(query, documents) + + # Add scores to results + for idx, score in zip(valid_indices, scores): + results[idx]["relevance_score"] = float(score) + + # Add zero scores to results without head_data + for i, result in enumerate(results): + if i not in valid_indices: + result["relevance_score"] = 0.0 + + # Filter by score threshold if specified + if score_threshold is not None: + original_count = len(results) + results = [r for r in results if r.get("relevance_score", 0.0) >= score_threshold] + self._log("info", "Filtered {filtered} URLs below score threshold {threshold}. Remaining: {remaining}", + params={"filtered": original_count - len(results), + "threshold": score_threshold, + "remaining": len(results)}, tag="URL_SEED") + + # Sort by relevance score (highest first) + results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True) + else: + self._log("warning", "No valid head data found for BM25 scoring.", tag="URL_SEED") + # Add zero scores to all results + for result in results: + result["relevance_score"] = 0.0 + elif query and not extract_head: + self._log("warning", "Query provided but extract_head is False. 
Enable extract_head for relevance scoring.", tag="URL_SEED") + + return results[:max_urls] if max_urls > 0 else results + + async def many_urls( + self, + domains: Sequence[str], + config: "SeedingConfig", + ) -> Dict[str, List[Dict[str,Any]]]: + """ + Fetch URLs for many domains in parallel. + + Parameters + ---------- + domains : Sequence[str] + List of domains to fetch URLs for + config : SeedingConfig + Configuration object containing all seeding parameters + + Returns a {domain: urls-list} dict. + """ + self._log("info", "Starting URL seeding for {count} domains...", + params={"count": len(domains)}, tag="URL_SEED") + + # Ensure seeder's logger verbose matches the config's verbose if it's set + if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None: + self.logger.verbose = config.verbose + + tasks = [ + self.urls(domain, config) + for domain in domains + ] + results = await asyncio.gather(*tasks) + + final_results = dict(zip(domains, results)) + self._log("info", "Finished URL seeding for multiple domains.", tag="URL_SEED") + return final_results + + async def _resolve_head(self, url: str) -> Optional[str]: + """ + HEAD-probe a URL. + + Returns: + * the same URL if it answers 2xx, + * the absolute redirect target if it answers 3xx, + * None on any other status or network error. 
+ """ + try: + r = await self.client.head(url, timeout=10, follow_redirects=False) + + # direct hit + if 200 <= r.status_code < 300: + return str(r.url) + + # single level redirect + if r.status_code in (301, 302, 303, 307, 308): + loc = r.headers.get("location") + if loc: + return urljoin(url, loc) + + return None + + except Exception as e: + self._log("debug", "HEAD {url} failed: {err}", + params={"url": url, "err": str(e)}, tag="URL_SEED") + return None + + + # ─────────────────────────────── CC + async def _from_cc(self, domain:str, pattern:str, force:bool): + import re + digest = hashlib.md5(pattern.encode()).hexdigest()[:8] + + # ── normalise for CC (strip scheme, query, fragment) + raw = re.sub(r'^https?://', '', domain).split('#', 1)[0].split('?', 1)[0].lstrip('.') + + # ── sanitize only for cache-file name + safe = re.sub('[/?#]+', '_', raw) + path = self.cache_dir / f"{self.index_id}_{safe}_{digest}.jsonl" + + if path.exists() and not force: + self._log("info", "Loading CC URLs for {domain} from cache: {path}", + params={"domain": domain, "path": path}, tag="URL_SEED") + async with aiofiles.open(path,"r") as fp: + async for line in fp: + url=line.strip() + if _match(url,pattern): yield url + return + + # build CC glob – if a path is present keep it, else add trailing /* + glob = f"*.{raw}*" if '/' in raw else f"*.{raw}/*" + url = f"https://index.commoncrawl.org/{self.index_id}-index?url={quote(glob, safe='*')}&output=json" + + retries=(1,3,7) + self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}", + params={"domain": domain, "url": url}, tag="URL_SEED") + for i,d in enumerate(retries+(-1,)): # last -1 means don't retry + try: + async with self.client.stream("GET", url) as r: + r.raise_for_status() + async with aiofiles.open(path,"w") as fp: + async for line in r.aiter_lines(): + rec = json.loads(line) + u = rec["url"] + await fp.write(u+"\n") + if _match(u,pattern): yield u + return + except httpx.HTTPStatusError as e: + if 
e.response.status_code==503 and ibool: + try: + r=await self.client.head(url, timeout=timeout, + headers={"Range":"bytes=0-0","Accept-Encoding":"identity"}) + r.raise_for_status() # Raise for bad status codes (4xx, 5xx) + return True + except httpx.RequestError as e: + self._log("debug", "HEAD check network error for {url}: {error}", + params={"url": url, "error": str(e)}, tag="URL_SEED") + return False + except httpx.HTTPStatusError as e: + self._log("debug", "HEAD check HTTP status error for {url}: {status_code}", + params={"url": url, "status_code": e.response.status_code}, tag="URL_SEED") + return False + except Exception as e: + self._log("error", "Unexpected error during HEAD check for {url}: {error}", + params={"url": url, "error": str(e)}, tag="URL_SEED") + return False + + async def _fetch_head( + self, + url: str, + timeout: int, + max_redirects: int = 5, + max_bytes: int = 65_536, # stop after 64 kB even if never comes + chunk_size: int = 4096, # how much we read per await + ): + for _ in range(max_redirects+1): + try: + # ask the first `max_bytes` and force plain text to avoid + # partial-gzip decode headaches + async with self.client.stream( + "GET", + url, + timeout=timeout, + headers={ + # "Range": f"bytes=0-{max_bytes-1}", # Dropped the Range header – no need now, and some servers ignore it. We still keep an upper‐bound max_bytes as a fail-safe. 
+ "Accept-Encoding": "identity", + }, + follow_redirects=False, + ) as r: + + if r.status_code in (301,302,303,307,308): + location = r.headers.get("Location") + if location: + url = urljoin(url, location) + self._log("debug", "Redirecting from {original_url} to {new_url}", + params={"original_url": r.url, "new_url": url}, tag="URL_SEED") + continue + else: + self._log("warning", "Redirect status {status_code} but no Location header for {url}", + params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") + return False, "", str(r.url) # Return original URL if no new location + + # For 2xx or other non-redirect codes, proceed to read content + if not (200 <= r.status_code < 400): # Only allow successful codes, or continue + self._log("warning", "Non-success status {status_code} when fetching head for {url}", + params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") + return False, "", str(r.url) + + buf = bytearray() + async for chunk in r.aiter_bytes(chunk_size): + buf.extend(chunk) + low = buf.lower() + if b"" in low or len(buf) >= max_bytes: + await r.aclose() + break + + enc = r.headers.get("Content-Encoding", "").lower() + try: + if enc == "gzip" and buf[:2] == b"\x1f\x8b": + buf = gzip.decompress(buf) + elif enc == "br" and HAS_BROTLI and buf[:4] == b"\x8b\x6c\x0a\x1a": + buf = brotli.decompress(buf) + elif enc in {"gzip", "br"}: + # Header says “gzip” or “br” but payload is plain – ignore + self._log( + "debug", + "Skipping bogus {encoding} for {url}", + params={"encoding": enc, "url": r.url}, + tag="URL_SEED", + ) + except Exception as e: + self._log( + "warning", + "Decompression error for {url} ({encoding}): {error}", + params={"url": r.url, "encoding": enc, "error": str(e)}, + tag="URL_SEED", + ) + # fall through with raw buf + + # Find the tag case-insensitively and decode + idx = buf.lower().find(b"") + if idx==-1: + self._log("debug", "No tag found in initial bytes of {url}", + params={"url": r.url}, tag="URL_SEED") + # If no is 
found, take a reasonable chunk or all if small + html_bytes = buf if len(buf) < 10240 else buf[:10240] # Take max 10KB if no head tag + else: + html_bytes = buf[:idx+7] # Include tag + + try: + html = html_bytes.decode("utf-8", "replace") + except Exception as e: + self._log( + "warning", + "Failed to decode head content for {url}: {error}", + params={"url": r.url, "error": str(e)}, + tag="URL_SEED", + ) + html = html_bytes.decode("latin-1", "replace") + + return True,html,str(r.url) # Return the actual URL after redirects + + except httpx.RequestError as e: + self._log("debug", "Fetch head network error for {url}: {error}", + params={"url": url, "error": str(e)}, tag="URL_SEED") + return False,"",url + + # If loop finishes without returning (e.g. too many redirects) + self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}", + params={"max_redirects": max_redirects, "url": url}, tag="URL_SEED") + return False,"",url + + # ─────────────────────────────── BM25 scoring helpers + def _extract_text_context(self, head_data: Dict[str, Any]) -> str: + """Extract all relevant text from head metadata for scoring.""" + # Priority fields with their weights (for future enhancement) + text_parts = [] + + # Title + if head_data.get("title"): + text_parts.append(head_data["title"]) + + # Standard meta tags + meta = head_data.get("meta", {}) + for key in ["description", "keywords", "author", "subject", "summary", "abstract"]: + if meta.get(key): + text_parts.append(meta[key]) + + # Open Graph tags + for key in ["og:title", "og:description", "og:site_name", "article:tag"]: + if meta.get(key): + text_parts.append(meta[key]) + + # Twitter Card tags + for key in ["twitter:title", "twitter:description", "twitter:image:alt"]: + if meta.get(key): + text_parts.append(meta[key]) + + # Dublin Core tags + for key in ["dc.title", "dc.description", "dc.subject", "dc.creator"]: + if meta.get(key): + text_parts.append(meta[key]) + + # JSON-LD structured data + for jsonld in 
head_data.get("jsonld", []): + if isinstance(jsonld, dict): + # Extract common fields from JSON-LD + for field in ["name", "headline", "description", "abstract", "keywords"]: + if field in jsonld: + if isinstance(jsonld[field], str): + text_parts.append(jsonld[field]) + elif isinstance(jsonld[field], list): + text_parts.extend(str(item) for item in jsonld[field] if item) + + # Handle @graph structures + if "@graph" in jsonld and isinstance(jsonld["@graph"], list): + for item in jsonld["@graph"]: + if isinstance(item, dict): + for field in ["name", "headline", "description"]: + if field in item and isinstance(item[field], str): + text_parts.append(item[field]) + + # Combine all text parts + return " ".join(filter(None, text_parts)) + + def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]: + """Calculate BM25 scores for documents against a query.""" + if not HAS_BM25: + self._log("warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED") + return [0.0] * len(documents) + + if not query or not documents: + return [0.0] * len(documents) + + # Tokenize query and documents (simple whitespace tokenization) + # For production, consider using a proper tokenizer + query_tokens = query.lower().split() + tokenized_docs = [doc.lower().split() for doc in documents] + + # Handle edge case where all documents are empty + if all(len(doc) == 0 for doc in tokenized_docs): + return [0.0] * len(documents) + + # Create BM25 instance and calculate scores + try: + from rank_bm25 import BM25Okapi + bm25 = BM25Okapi(tokenized_docs) + scores = bm25.get_scores(query_tokens) + + # Normalize scores to 0-1 range + max_score = max(scores) if max(scores) > 0 else 1.0 + normalized_scores = [score / max_score for score in scores] + + return normalized_scores + except Exception as e: + self._log("error", "Error calculating BM25 scores: {error}", + params={"error": str(e)}, tag="URL_SEED") + return [0.0] * len(documents) + + # 
─────────────────────────────── index helper + async def _latest_index(self)->str: + if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime) Union[List[str], Dict[str, List[Union[str, Dict[str, Any]]]]]: + """ + Discovers, filters, and optionally validates URLs for a given domain(s) + using sitemaps and Common Crawl archives. + + Args: + domain_or_domains: A single domain string (e.g., "iana.org") or a list of domains. + config: A SeedingConfig object to control the seeding process. + Parameters passed directly via kwargs will override those in 'config'. + **kwargs: Additional parameters (e.g., `source`, `live_check`, `extract_head`, + `pattern`, `concurrency`, `hits_per_sec`, `force_refresh`, `verbose`) + that will be used to construct or update the SeedingConfig. + + Returns: + If `extract_head` is False: + - For a single domain: `List[str]` of discovered URLs. + - For multiple domains: `Dict[str, List[str]]` mapping each domain to its URLs. + If `extract_head` is True: + - For a single domain: `List[Dict[str, Any]]` where each dict contains 'url' + and 'head_data' (parsed metadata). + - For multiple domains: `Dict[str, List[Dict[str, Any]]]` mapping each domain + to a list of URL data dictionaries. + + Raises: + ValueError: If `domain_or_domains` is not a string or a list of strings. + Exception: Any underlying exceptions from AsyncUrlSeeder or network operations. 
+ + Example: + >>> # Discover URLs from sitemap with live check for 'example.com' + >>> result = await crawler.aseed_urls("example.com", source="sitemap", live_check=True, hits_per_sec=10) + + >>> # Discover URLs from Common Crawl, extract head data for 'example.com' and 'python.org' + >>> multi_domain_result = await crawler.aseed_urls( + >>> ["example.com", "python.org"], + >>> source="cc", extract_head=True, concurrency=200, hits_per_sec=50 + >>> ) + """ + # Initialize AsyncUrlSeeder here if it hasn't been already + if not self.url_seeder: + # Pass the crawler's base_directory for seeder's cache management + # Pass the crawler's logger for consistent logging + self.url_seeder = AsyncUrlSeeder( + base_directory=self.crawl4ai_folder, + logger=self.logger + ) + + # Merge config object with direct kwargs, giving kwargs precedence + seeding_config = config.clone(**kwargs) if config else SeedingConfig.from_kwargs(kwargs) + + # Ensure base_directory is set for the seeder's cache + seeding_config.base_directory = seeding_config.base_directory or self.crawl4ai_folder + # Ensure the seeder uses the crawler's logger (if not already set) + if not self.url_seeder.logger: + self.url_seeder.logger = self.logger + + # Pass verbose setting if explicitly provided in SeedingConfig or kwargs + if seeding_config.verbose is not None: + self.url_seeder.logger.verbose = seeding_config.verbose + else: # Default to crawler's verbose setting + self.url_seeder.logger.verbose = self.logger.verbose + + + if isinstance(domain_or_domains, str): + self.logger.info( + message="Starting URL seeding for domain: {domain}", + tag="SEED", + params={"domain": domain_or_domains} + ) + return await self.url_seeder.urls( + domain_or_domains, + seeding_config + ) + elif isinstance(domain_or_domains, (list, tuple)): + self.logger.info( + message="Starting URL seeding for {count} domains", + tag="SEED", + params={"count": len(domain_or_domains)} + ) + # AsyncUrlSeeder.many_urls directly accepts a list of 
domains and individual params. + return await self.url_seeder.many_urls( + domain_or_domains, + seeding_config + ) + else: + raise ValueError("`domain_or_domains` must be a string or a list of strings.") \ No newline at end of file diff --git a/crawl4ai/types.py b/crawl4ai/types.py index 63fd45ba..2b044ebd 100644 --- a/crawl4ai/types.py +++ b/crawl4ai/types.py @@ -10,12 +10,16 @@ CacheMode = Union['CacheModeType'] CrawlResult = Union['CrawlResultType'] CrawlerHub = Union['CrawlerHubType'] BrowserProfiler = Union['BrowserProfilerType'] +# NEW: Add AsyncUrlSeederType +AsyncUrlSeeder = Union['AsyncUrlSeederType'] # Configuration types BrowserConfig = Union['BrowserConfigType'] CrawlerRunConfig = Union['CrawlerRunConfigType'] HTTPCrawlerConfig = Union['HTTPCrawlerConfigType'] LLMConfig = Union['LLMConfigType'] +# NEW: Add SeedingConfigType +SeedingConfig = Union['SeedingConfigType'] # Content scraping types ContentScrapingStrategy = Union['ContentScrapingStrategyType'] @@ -94,6 +98,8 @@ if TYPE_CHECKING: from .models import CrawlResult as CrawlResultType from .hub import CrawlerHub as CrawlerHubType from .browser_profiler import BrowserProfiler as BrowserProfilerType + # NEW: Import AsyncUrlSeeder for type checking + from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType # Configuration imports from .async_configs import ( @@ -101,6 +107,8 @@ if TYPE_CHECKING: CrawlerRunConfig as CrawlerRunConfigType, HTTPCrawlerConfig as HTTPCrawlerConfigType, LLMConfig as LLMConfigType, + # NEW: Import SeedingConfig for type checking + SeedingConfig as SeedingConfigType, ) # Content scraping imports @@ -184,4 +192,4 @@ if TYPE_CHECKING: def create_llm_config(*args, **kwargs) -> 'LLMConfigType': from .async_configs import LLMConfig - return LLMConfig(*args, **kwargs) + return LLMConfig(*args, **kwargs) \ No newline at end of file diff --git a/docs/examples/url_seeder/url_seeder_demo.py b/docs/examples/url_seeder/url_seeder_demo.py new file mode 100644 index 
00000000..faf730f9 --- /dev/null +++ b/docs/examples/url_seeder/url_seeder_demo.py @@ -0,0 +1,261 @@ +""" +URL Seeder Demo - Interactive showcase of Crawl4AI's URL discovery capabilities + +This demo shows: +1. Basic URL discovery from sitemaps and Common Crawl +2. Cache management and forced refresh +3. Live URL validation and metadata extraction +4. BM25 relevance scoring for intelligent filtering +5. Integration with AsyncWebCrawler for the complete pipeline +""" + +import asyncio +import time +from datetime import datetime +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +from rich.progress import Progress, SpinnerColumn, BarColumn, TimeElapsedColumn +from rich.prompt import Prompt, Confirm +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + AsyncUrlSeeder, + SeedingConfig +) + +console = Console() + +console.rule("[bold green]🌐 Crawl4AI URL Seeder: Interactive Demo") + +DOMAIN = "crawl4ai.com" + +# Utils + +def print_head_info(head_data): + table = Table(title=" Metadata", expand=True) + table.add_column("Key", style="cyan", no_wrap=True) + table.add_column("Value", style="magenta") + + if not head_data: + console.print("[yellow]No head data found.") + return + + if head_data.get("title"): + table.add_row("title", head_data["title"]) + if head_data.get("charset"): + table.add_row("charset", head_data["charset"]) + for k, v in head_data.get("meta", {}).items(): + table.add_row(f"meta:{k}", v) + for rel, items in head_data.get("link", {}).items(): + for item in items: + table.add_row(f"link:{rel}", item.get("href", "")) + console.print(table) + + +async def section_1_basic_exploration(seed: AsyncUrlSeeder): + console.rule("[bold cyan]1. 
Basic Seeding") + cfg = SeedingConfig(source="cc+sitemap", pattern="*", verbose=True) + + start_time = time.time() + with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p: + p.add_task(description="Fetching from Common Crawl + Sitemap...", total=None) + urls = await seed.urls(DOMAIN, cfg) + elapsed = time.time() - start_time + + console.print(f"[green]✓ Fetched {len(urls)} URLs in {elapsed:.2f} seconds") + console.print(f"[dim] Speed: {len(urls)/elapsed:.0f} URLs/second[/dim]\n") + + console.print("[bold]Sample URLs:[/bold]") + for u in urls[:5]: + console.print(f" • {u['url']}") + + +async def section_2_cache_demo(seed: AsyncUrlSeeder): + console.rule("[bold cyan]2. Caching Demonstration") + console.print("[yellow]Using `force=True` to bypass cache and fetch fresh data.[/yellow]") + cfg = SeedingConfig(source="cc", pattern="*crawl4ai.com/core/*", verbose=False, force = True) + await seed.urls(DOMAIN, cfg) + +async def section_3_live_head(seed: AsyncUrlSeeder): + console.rule("[bold cyan]3. Live Check + Head Extraction") + cfg = SeedingConfig( + extract_head=True, + concurrency=10, + hits_per_sec=5, + pattern="*crawl4ai.com/*", + max_urls=10, + verbose=False, + ) + urls = await seed.urls(DOMAIN, cfg) + + valid = [u for u in urls if u["status"] == "valid"] + console.print(f"[green]Valid: {len(valid)} / {len(urls)}") + if valid: + print_head_info(valid[0]["head_data"]) + + +async def section_4_bm25_scoring(seed: AsyncUrlSeeder): + console.rule("[bold cyan]4. 
BM25 Relevance Scoring") + console.print("[yellow]Using AI-powered relevance scoring to find the most relevant content[/yellow]") + + query = "markdown generation extraction strategies" + cfg = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + score_threshold=0.3, # Only URLs with >30% relevance + max_urls=20, + verbose=False + ) + + with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p: + p.add_task(description=f"Searching for: '{query}'", total=None) + urls = await seed.urls(DOMAIN, cfg) + + console.print(f"[green]Found {len(urls)} relevant URLs (score > 0.3)") + + # Show top results with scores + table = Table(title="Top 5 Most Relevant Pages", expand=True) + table.add_column("Score", style="cyan", width=8) + table.add_column("Title", style="magenta") + table.add_column("URL", style="blue", overflow="fold") + + for url in urls[:5]: + score = f"{url['relevance_score']:.2f}" + title = url['head_data'].get('title', 'No title')[:60] + "..." + table.add_row(score, title, url['url']) + + console.print(table) + +async def section_5_keyword_filter_to_agent(seed: AsyncUrlSeeder): + console.rule("[bold cyan]5. 
Complete Pipeline: Discover → Filter → Crawl") + cfg = SeedingConfig( + extract_head=True, + concurrency=20, + hits_per_sec=10, + max_urls=10, + pattern="*crawl4ai.com/*", + force=True, + ) + urls = await seed.urls(DOMAIN, cfg) + + keywords = ["deep crawling", "markdown", "llm"] + selected = [u for u in urls if any(k in str(u["head_data"]).lower() for k in keywords)] + + console.print(f"[cyan]Selected {len(selected)} URLs with relevant keywords:") + for u in selected[:10]: + console.print("•", u["url"]) + + console.print("\n[yellow]Passing above URLs to arun_many() LLM agent for crawling...") + async with AsyncWebCrawler(verbose=True) as crawler: + crawl_run_config = CrawlerRunConfig( + # Example crawl settings for these URLs: + only_text=True, # Just get text content + screenshot=False, + pdf=False, + word_count_threshold=50, # Only process pages with at least 50 words + stream=True, + verbose=False # Keep logs clean for arun_many in this demo + ) + + # Extract just the URLs from the selected results + urls_to_crawl = [u["url"] for u in selected] + + # We'll stream results for large lists, but collect them here for demonstration + crawled_results_stream = await crawler.arun_many(urls_to_crawl, config=crawl_run_config) + final_crawled_data = [] + async for result in crawled_results_stream: + final_crawled_data.append(result) + if len(final_crawled_data) % 5 == 0: + print(f" Processed {len(final_crawled_data)}/{len(urls_to_crawl)} URLs...") + + print(f"\n Successfully crawled {len(final_crawled_data)} URLs.") + if final_crawled_data: + print("\n Example of a crawled result's URL and Markdown (first successful one):") + for result in final_crawled_data: + if result.success and result.markdown.raw_markdown: + print(f" URL: {result.url}") + print(f" Markdown snippet: {result.markdown.raw_markdown[:200]}...") + break + else: + print(" No successful crawls with markdown found.") + else: + print(" No successful crawls found.") + + +async def section_6_multi_domain(seed: 
AsyncUrlSeeder): + console.rule("[bold cyan]6. Multi-Domain Discovery") + console.print("[yellow]Discovering Python tutorials across multiple educational sites[/yellow]\n") + + domains = ["docs.python.org", "realpython.com", "docs.crawl4ai.com"] + cfg = SeedingConfig( + source="sitemap", + extract_head=True, + query="python tutorial guide", + scoring_method="bm25", + score_threshold=0.2, + max_urls=5 # Per domain + ) + + start_time = time.time() + with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p: + task = p.add_task(description="Discovering across domains...", total=None) + results = await seed.many_urls(domains, cfg) + elapsed = time.time() - start_time + + total_urls = sum(len(urls) for urls in results.values()) + console.print(f"[green]✓ Found {total_urls} relevant URLs across {len(domains)} domains in {elapsed:.2f}s\n") + + # Show results per domain + for domain, urls in results.items(): + console.print(f"[bold]{domain}:[/bold] {len(urls)} relevant pages") + if urls: + top = urls[0] + console.print(f" Top result: [{top['relevance_score']:.2f}] {top['head_data'].get('title', 'No title')}") + + +async def main(): + seed = AsyncUrlSeeder() + + # Interactive menu + sections = { + "1": ("Basic URL Discovery", section_1_basic_exploration), + "2": ("Cache Management Demo", section_2_cache_demo), + "3": ("Live Check & Metadata Extraction", section_3_live_head), + "4": ("BM25 Relevance Scoring", section_4_bm25_scoring), + "5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent), + "6": ("Multi-Domain Discovery", section_6_multi_domain), + "7": ("Run All Demos", None) + } + + console.print("\n[bold]Available Demos:[/bold]") + for key, (title, _) in sections.items(): + console.print(f" {key}. 
{title}") + + choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]", + choices=list(sections.keys()), + default="7") + + console.print() + + if choice == "7": + # Run all demos + for key, (title, func) in sections.items(): + if key != "7" and func: + await func(seed) + if key != "6": # Don't pause after the last demo + if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True): + break + console.print() + else: + # Run selected demo + _, func = sections[choice] + await func(seed) + + console.rule("[bold green]Demo Complete ✔︎") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/url_seeder/url_seeder_quick_demo.py b/docs/examples/url_seeder/url_seeder_quick_demo.py new file mode 100644 index 00000000..3dc58b98 --- /dev/null +++ b/docs/examples/url_seeder/url_seeder_quick_demo.py @@ -0,0 +1,128 @@ +""" +🚀 URL Seeder + AsyncWebCrawler = Magic! +Quick demo showing discovery → filter → crawl pipeline +""" +import asyncio +from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +# 🔍 Example 1: Discover ALL → Filter → Crawl +async def discover_and_crawl(): + """Find Python module tutorials & extract them all!""" + seeder = AsyncUrlSeeder( + logger=AsyncLogger() # Log everything + ) + + # Step 1: See how many URLs exist (spoiler: A LOT!) + print("📊 Let's see what RealPython has...") + all_urls = await seeder.urls("realpython.com", + SeedingConfig(source="sitemap")) + print(f"😱 Found {len(all_urls)} total URLs!") + + # Step 2: Filter for Python modules (perfect size ~13) + print("\n🎯 Filtering for 'python-modules' tutorials...") + module_urls = await seeder.urls("realpython.com", + SeedingConfig( + source="sitemap", + pattern="*python-modules*", + live_check=True # Make sure they're alive! 
+ )) + + print(f"✨ Found {len(module_urls)} module tutorials") + for url in module_urls[:3]: # Show first 3 + status = "✅" if url["status"] == "valid" else "❌" + print(f"{status} {url['url']}") + + # Step 3: Crawl them all with pruning (keep it lean!) + print("\n🕷️ Crawling all module tutorials...") + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( # Smart filtering! + threshold=0.48, # Remove fluff + threshold_type="fixed", + ), + ), + only_text=True, + stream=True, + ) + + # Extract just the URLs from the seeder results + urls_to_crawl = [u["url"] for u in module_urls[:5]] + results = await crawler.arun_many(urls_to_crawl, config=config) + + # Process & save + saved = 0 + async for result in results: + if result.success: + # Save each tutorial (name from URL) + name = result.url.split("/")[-2] + ".md" + with open(name, "w") as f: + f.write(result.markdown.fit_markdown) + saved += 1 + print(f"💾 Saved: {name}") + + print(f"\n🎉 Successfully saved {saved} tutorials!") + +# 🔍 Example 2: Beautiful Soup articles with metadata peek +async def explore_beautifulsoup(): + """Discover BeautifulSoup content & peek at metadata""" + seeder = AsyncUrlSeeder(logger=AsyncLogger() ) + + print("🍲 Looking for Beautiful Soup articles...") + soup_urls = await seeder.urls("realpython.com", + SeedingConfig( + source="sitemap", + pattern="*beautiful-soup*", + extract_head=True # Get the metadata! + )) + + print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n") + + # Show what we discovered + for i, url in enumerate(soup_urls, 1): + meta = url["head_data"]["meta"] + + print(f"{i}. 
{url['head_data']['title']}") + print(f" 📝 {meta.get('description', 'No description')[:60]}...") + print(f" 👤 By: {meta.get('author', 'Unknown')}") + print(f" 🔗 {url['url']}\n") + +# 🔍 Example 3: Smart search with BM25 relevance scoring +async def smart_search_with_bm25(): + """Use AI-powered relevance scoring to find the best content""" + seeder = AsyncUrlSeeder(logger=AsyncLogger() ) + + print("🧠 Smart search: 'web scraping tutorial quiz'") + + # Search with BM25 scoring - AI finds the best matches! + results = await seeder.urls("realpython.com", + SeedingConfig( + source="sitemap", + pattern="*beautiful-soup*", + extract_head=True, + query="web scraping tutorial quiz", # Our search + scoring_method="bm25", + score_threshold=0.2 # Quality filter + )) + + print(f"\n🎯 Top {len(results)} most relevant results:\n") + + # Show ranked results with relevance scores + for i, result in enumerate(results[:3], 1): + print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}") + print(f" 🔗 {result['url'][:60]}...") + + print("\n✨ BM25 automatically ranked by relevance!") + +# 🎬 Run the show! 
+async def main(): + # print("=" * 60) + # await discover_and_crawl() + # print("\n" + "=" * 60 + "\n") + # await explore_beautifulsoup() + # print("\n" + "=" * 60 + "\n") + await smart_search_with_bm25() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index 46b90ab0..fcd56b7a 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -16,22 +16,10 @@ --mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, Courier New, monospace, serif; - --background-color: #151515; /* Dark background */ - --font-color: #eaeaea; /* Light font color for contrast */ - --invert-font-color: #151515; /* Dark color for inverted elements */ - --primary-color: #1a95e0; /* Primary color can remain the same or be adjusted for better contrast */ - --secondary-color: #727578; /* Secondary color for less important text */ --secondary-dimmed-color: #8b857a; /* Dimmed secondary color */ - --error-color: #ff5555; /* Bright color for errors */ - --progress-bar-background: #444; /* Darker background for progress bar */ - --progress-bar-fill: #1a95e0; /* Bright color for progress bar fill */ - --code-bg-color: #1e1e1e; /* Darker background for code blocks */ - --input-style: solid; /* Keeping input style solid */ --block-background-color: #202020; /* Darker background for block elements */ --global-font-color: #eaeaea; /* Light font color for global elements */ - --background-color: #222225; - --background-color: #070708; --page-width: 70em; --font-color: #e8e9ed; @@ -40,7 +28,7 @@ --secondary-color: #d5cec0; --tertiary-color: #a3abba; --primary-dimmed-color: #09b5a5; /* Updated to the brand color */ - --primary-color: #50ffff; /* Updated to the brand color */ + --primary-color: #0fbbaa; /* Updated to the brand color */ --accent-color: rgb(243, 128, 245); --error-color: #ff3c74; --progress-bar-background: #3f3f44; diff 
--git a/docs/md_v2/core/url-seeding.md b/docs/md_v2/core/url-seeding.md new file mode 100644 index 00000000..7ffe9ea7 --- /dev/null +++ b/docs/md_v2/core/url-seeding.md @@ -0,0 +1,1000 @@ +# URL Seeding: The Smart Way to Crawl at Scale + +## Why URL Seeding? + +Web crawling comes in different flavors, each with its own strengths. Let's understand when to use URL seeding versus deep crawling. + +### Deep Crawling: Real-Time Discovery + +Deep crawling is perfect when you need: +- **Fresh, real-time data** - discovering pages as they're created +- **Dynamic exploration** - following links based on content +- **Selective extraction** - stopping when you find what you need + +```python +# Deep crawling example: Explore a website dynamically +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy + +async def deep_crawl_example(): + # Configure a 2-level deep crawl + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Crawl 2 levels deep + include_external=False, # Stay within domain + max_pages=50 # Limit for efficiency + ), + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + # Start crawling and follow links dynamically + results = await crawler.arun("https://example.com", config=config) + + print(f"Discovered and crawled {len(results)} pages") + for result in results[:3]: + print(f"Found: {result.url} at depth {result.metadata.get('depth', 0)}") + +asyncio.run(deep_crawl_example()) +``` + +### URL Seeding: Bulk Discovery + +URL seeding shines when you want: +- **Comprehensive coverage** - get thousands of URLs in seconds +- **Bulk processing** - filter before crawling +- **Resource efficiency** - know exactly what you'll crawl + +```python +# URL seeding example: Analyze all documentation +from crawl4ai import AsyncUrlSeeder, SeedingConfig + +seeder = AsyncUrlSeeder() +config = SeedingConfig( + source="sitemap", + extract_head=True, + 
pattern="*/docs/*" +) + +# Get ALL documentation URLs instantly +urls = await seeder.urls("example.com", config) +# 1000+ URLs discovered in seconds! +``` + +### The Trade-offs + +| Aspect | Deep Crawling | URL Seeding | +|--------|---------------|-------------| +| **Coverage** | Discovers pages dynamically | Gets most existing URLs instantly | +| **Freshness** | Finds brand new pages | May miss very recent pages | +| **Speed** | Slower, page by page | Extremely fast bulk discovery | +| **Resource Usage** | Higher - crawls to discover | Lower - discovers then crawls | +| **Control** | Can stop mid-process | Pre-filters before crawling | + +### When to Use Each + +**Choose Deep Crawling when:** +- You need the absolute latest content +- You're searching for specific information +- The site structure is unknown or dynamic +- You want to stop as soon as you find what you need + +**Choose URL Seeding when:** +- You need to analyze large portions of a site +- You want to filter URLs before crawling +- You're doing comparative analysis +- You need to optimize resource usage + +The magic happens when you understand both approaches and choose the right tool for your task. Sometimes, you might even combine them - use URL seeding for bulk discovery, then deep crawl specific sections for the latest updates. + +## Your First URL Seeding Adventure + +Let's see the magic in action. We'll discover blog posts about Python, filter for tutorials, and crawl only those pages. 
+ +```python +import asyncio +from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig + +async def smart_blog_crawler(): + # Step 1: Create our URL discoverer + seeder = AsyncUrlSeeder() + + # Step 2: Configure discovery - let's find all blog posts + config = SeedingConfig( + source="sitemap", # Use the website's sitemap + pattern="*/blog/*.html", # Only blog posts + extract_head=True, # Get page metadata + max_urls=100 # Limit for this example + ) + + # Step 3: Discover URLs from the Python blog + print("🔍 Discovering blog posts...") + urls = await seeder.urls("realpython.com", config) + print(f"✅ Found {len(urls)} blog posts") + + # Step 4: Filter for Python tutorials (using metadata!) + tutorials = [ + url for url in urls + if url["status"] == "valid" and + any(keyword in str(url["head_data"]).lower() + for keyword in ["tutorial", "guide", "how to"]) + ] + print(f"📚 Filtered to {len(tutorials)} tutorials") + + # Step 5: Show what we found + print("\n🎯 Found these tutorials:") + for tutorial in tutorials[:5]: # First 5 + title = tutorial["head_data"].get("title", "No title") + print(f" - {title}") + print(f" {tutorial['url']}") + + # Step 6: Now crawl ONLY these relevant pages + print("\n🚀 Crawling tutorials...") + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + only_text=True, + word_count_threshold=300 # Only substantial articles + ) + + # Extract URLs and stream results as they come + tutorial_urls = [t["url"] for t in tutorials[:10]] + results = await crawler.arun_many(tutorial_urls, config=config) + + successful = 0 + async for result in results: + if result.success: + successful += 1 + print(f"✓ Crawled: {result.url[:60]}...") + + print(f"\n✨ Successfully crawled {successful} tutorials!") + +# Run it! +asyncio.run(smart_blog_crawler()) +``` + +**What just happened?** + +1. We discovered all blog URLs from the sitemap +2. We filtered using metadata (no crawling needed!) +3. 
We crawled only the relevant tutorials +4. We saved tons of time and bandwidth + +This is the power of URL seeding - you see everything before you crawl anything. + +## Understanding the URL Seeder + +Now that you've seen the magic, let's understand how it works. + +### Basic Usage + +Creating a URL seeder is simple: + +```python +from crawl4ai import AsyncUrlSeeder + +# Create a seeder instance +seeder = AsyncUrlSeeder() + +# Discover URLs from a domain +config = SeedingConfig(source="sitemap") +urls = await seeder.urls("example.com", config) +``` + +The seeder can discover URLs from two powerful sources: + +#### 1. Sitemaps (Fastest) + +```python +# Discover from sitemap +config = SeedingConfig(source="sitemap") +urls = await seeder.urls("example.com", config) +``` + +Sitemaps are XML files that websites create specifically to list all their URLs. It's like getting a menu at a restaurant - everything is listed upfront. + +#### 2. Common Crawl (Most Comprehensive) + +```python +# Discover from Common Crawl +config = SeedingConfig(source="cc") +urls = await seeder.urls("example.com", config) +``` + +Common Crawl is a massive public dataset that regularly crawls the entire web. It's like having access to a pre-built index of the internet. + +#### 3. Both Sources (Maximum Coverage) + +```python +# Use both sources +config = SeedingConfig(source="cc+sitemap") +urls = await seeder.urls("example.com", config) +``` + +### Configuration Magic: SeedingConfig + +The `SeedingConfig` object is your control panel. 
Here's everything you can configure: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" | +| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | +| `extract_head` | bool | False | Extract metadata from page `<head>` | +| `live_check` | bool | False | Verify URLs are accessible | +| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | +| `concurrency` | int | 1000 | Parallel workers for fetching | +| `hits_per_sec` | int | 5 | Rate limit for requests | +| `force` | bool | False | Bypass cache, fetch fresh data | +| `verbose` | bool | None | Show detailed progress (None = inherit crawler setting) | +| `query` | str | None | Search query for BM25 scoring | +| `scoring_method` | str | "bm25" | Scoring method (currently "bm25") | +| `score_threshold` | float | None | Minimum score to include URL | + +#### Pattern Matching Examples + +```python +# Match all blog posts +config = SeedingConfig(pattern="*/blog/*") + +# Match only HTML files +config = SeedingConfig(pattern="*.html") + +# Match product pages +config = SeedingConfig(pattern="*/product/*") + +# Match everything except admin pages +config = SeedingConfig(pattern="*") +# Then filter: urls = [u for u in urls if "/admin/" not in u["url"]] +``` + +### URL Validation: Live Checking + +Sometimes you need to know if URLs are actually accessible. 
That's where live checking comes in: + +```python +config = SeedingConfig( + source="sitemap", + live_check=True, # Verify each URL is accessible + concurrency=20 # Check 20 URLs in parallel +) + +urls = await seeder.urls("example.com", config) + +# Now you can filter by status +live_urls = [u for u in urls if u["status"] == "valid"] +dead_urls = [u for u in urls if u["status"] == "not_valid"] + +print(f"Live URLs: {len(live_urls)}") +print(f"Dead URLs: {len(dead_urls)}") +``` + +**When to use live checking:** +- Before a large crawling operation +- When working with older sitemaps +- When data freshness is critical + +**When to skip it:** +- Quick explorations +- When you trust the source +- When speed is more important than accuracy + +### The Power of Metadata: Head Extraction + +This is where URL seeding gets really powerful. Instead of crawling entire pages, you can extract just the metadata: + +```python +config = SeedingConfig( + extract_head=True # Extract metadata from <head> section +) + +urls = await seeder.urls("example.com", config) + +# Now each URL has rich metadata +for url in urls[:3]: + print(f"\nURL: {url['url']}") + print(f"Title: {url['head_data'].get('title')}") + + meta = url['head_data'].get('meta', {}) + print(f"Description: {meta.get('description')}") + print(f"Keywords: {meta.get('keywords')}") + + # Even Open Graph data! + print(f"OG Image: {meta.get('og:image')}") +``` + +#### What Can We Extract? 
+ +The head extraction gives you a treasure trove of information: + +```python +# Example of extracted head_data +{ + "title": "10 Python Tips for Beginners", + "charset": "utf-8", + "lang": "en", + "meta": { + "description": "Learn essential Python tips...", + "keywords": "python, programming, tutorial", + "author": "Jane Developer", + "viewport": "width=device-width, initial-scale=1", + + # Open Graph tags + "og:title": "10 Python Tips for Beginners", + "og:description": "Essential Python tips for new programmers", + "og:image": "https://example.com/python-tips.jpg", + "og:type": "article", + + # Twitter Card tags + "twitter:card": "summary_large_image", + "twitter:title": "10 Python Tips", + + # Dublin Core metadata + "dc.creator": "Jane Developer", + "dc.date": "2024-01-15" + }, + "link": { + "canonical": [{"href": "https://example.com/blog/python-tips"}], + "alternate": [{"href": "/feed.xml", "type": "application/rss+xml"}] + }, + "jsonld": [ + { + "@type": "Article", + "headline": "10 Python Tips for Beginners", + "datePublished": "2024-01-15", + "author": {"@type": "Person", "name": "Jane Developer"} + } + ] +} +``` + +This metadata is gold for filtering! You can find exactly what you need without crawling a single page. + +### Understanding Results + +Each URL in the results has this structure: + +```python +{ + "url": "https://example.com/blog/python-tips.html", + "status": "valid", # "valid", "not_valid", or "unknown" + "head_data": { # Only if extract_head=True + "title": "Page Title", + "meta": {...}, + "link": {...}, + "jsonld": [...] 
+ }, + "relevance_score": 0.85 # Only if using BM25 scoring +} +``` + +Let's see a real example: + +```python +config = SeedingConfig( + source="sitemap", + extract_head=True, + live_check=True +) + +urls = await seeder.urls("blog.example.com", config) + +# Analyze the results +for url in urls[:5]: + print(f"\n{'='*60}") + print(f"URL: {url['url']}") + print(f"Status: {url['status']}") + + if url['head_data']: + data = url['head_data'] + print(f"Title: {data.get('title', 'No title')}") + + # Check content type + meta = data.get('meta', {}) + content_type = meta.get('og:type', 'unknown') + print(f"Content Type: {content_type}") + + # Publication date + pub_date = None + for jsonld in data.get('jsonld', []): + if isinstance(jsonld, dict): + pub_date = jsonld.get('datePublished') + if pub_date: + break + + if pub_date: + print(f"Published: {pub_date}") + + # Word count (if available) + word_count = meta.get('word_count') + if word_count: + print(f"Word Count: {word_count}") +``` + +## Smart Filtering with BM25 Scoring + +Now for the really cool part - intelligent filtering based on relevance! + +### Introduction to Relevance Scoring + +BM25 is a ranking algorithm that scores how relevant a document is to a search query. With URL seeding, we can score URLs based on their metadata *before* crawling them. + +Think of it like this: +- Traditional way: Read every book in the library to find ones about Python +- Smart way: Check the titles and descriptions, score them, read only the most relevant + +### Query-Based Discovery + +Here's how to use BM25 scoring: + +```python +config = SeedingConfig( + source="sitemap", + extract_head=True, # Required for scoring + query="python async tutorial", # What we're looking for + scoring_method="bm25", # Use BM25 algorithm + score_threshold=0.3 # Minimum relevance score +) + +urls = await seeder.urls("realpython.com", config) + +# Results are automatically sorted by relevance! 
+for url in urls[:5]: + print(f"Score: {url['relevance_score']:.2f} - {url['url']}") + print(f" Title: {url['head_data']['title']}") +``` + +### Real Examples + +#### Finding Documentation Pages + +```python +# Find API documentation +config = SeedingConfig( + source="sitemap", + extract_head=True, + query="API reference documentation endpoints", + scoring_method="bm25", + score_threshold=0.5, + max_urls=20 +) + +urls = await seeder.urls("docs.example.com", config) + +# The highest scoring URLs will be API docs! +``` + +#### Discovering Product Pages + +```python +# Find specific products +config = SeedingConfig( + source="cc+sitemap", # Use both sources + extract_head=True, + query="wireless headphones noise canceling", + scoring_method="bm25", + score_threshold=0.4, + pattern="*/product/*" # Combine with pattern matching +) + +urls = await seeder.urls("shop.example.com", config) + +# Filter further by price (from metadata) +affordable = [ + u for u in urls + if float(u['head_data'].get('meta', {}).get('product:price', '0')) < 200 +] +``` + +#### Filtering News Articles + +```python +# Find recent news about AI +config = SeedingConfig( + source="sitemap", + extract_head=True, + query="artificial intelligence machine learning breakthrough", + scoring_method="bm25", + score_threshold=0.35 +) + +urls = await seeder.urls("technews.com", config) + +# Filter by date +from datetime import datetime, timedelta + +recent = [] +cutoff = datetime.now() - timedelta(days=7) + +for url in urls: + # Check JSON-LD for publication date + for jsonld in url['head_data'].get('jsonld', []): + if 'datePublished' in jsonld: + pub_date = datetime.fromisoformat(jsonld['datePublished'].replace('Z', '+00:00')) + if pub_date > cutoff: + recent.append(url) + break +``` + +#### Complex Query Patterns + +```python +# Multi-concept queries +queries = [ + "python async await concurrency tutorial", + "data science pandas numpy visualization", + "web scraping beautifulsoup selenium automation", + 
"machine learning tensorflow keras deep learning" +] + +all_tutorials = [] + +for query in queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + score_threshold=0.4, + max_urls=10 # Top 10 per topic + ) + + urls = await seeder.urls("learning-platform.com", config) + all_tutorials.extend(urls) + +# Remove duplicates while preserving order +seen = set() +unique_tutorials = [] +for url in all_tutorials: + if url['url'] not in seen: + seen.add(url['url']) + unique_tutorials.append(url) + +print(f"Found {len(unique_tutorials)} unique tutorials across all topics") +``` + +## Scaling Up: Multiple Domains + +When you need to discover URLs across multiple websites, URL seeding really shines. + +### The `many_urls` Method + +```python +# Discover URLs from multiple domains in parallel +domains = ["site1.com", "site2.com", "site3.com"] + +config = SeedingConfig( + source="sitemap", + extract_head=True, + query="python tutorial", + scoring_method="bm25", + score_threshold=0.3 +) + +# Returns a dictionary: {domain: [urls]} +results = await seeder.many_urls(domains, config) + +# Process results +for domain, urls in results.items(): + print(f"\n{domain}: Found {len(urls)} relevant URLs") + if urls: + top = urls[0] # Highest scoring + print(f" Top result: {top['url']}") + print(f" Score: {top['relevance_score']:.2f}") +``` + +### Cross-Domain Examples + +#### Competitor Analysis + +```python +# Analyze content strategies across competitors +competitors = [ + "competitor1.com", + "competitor2.com", + "competitor3.com" +] + +config = SeedingConfig( + source="sitemap", + extract_head=True, + pattern="*/blog/*", + max_urls=100 +) + +results = await seeder.many_urls(competitors, config) + +# Analyze content types +for domain, urls in results.items(): + content_types = {} + + for url in urls: + # Extract content type from metadata + og_type = url['head_data'].get('meta', {}).get('og:type', 'unknown') + 
content_types[og_type] = content_types.get(og_type, 0) + 1 + + print(f"\n{domain} content distribution:") + for ctype, count in sorted(content_types.items(), key=lambda x: x[1], reverse=True): + print(f" {ctype}: {count}") +``` + +#### Industry Research + +```python +# Research Python tutorials across educational sites +educational_sites = [ + "realpython.com", + "pythontutorial.net", + "learnpython.org", + "python.org" +] + +config = SeedingConfig( + source="sitemap", + extract_head=True, + query="beginner python tutorial basics", + scoring_method="bm25", + score_threshold=0.3, + max_urls=20 # Per site +) + +results = await seeder.many_urls(educational_sites, config) + +# Find the best beginner tutorials +all_tutorials = [] +for domain, urls in results.items(): + for url in urls: + url['domain'] = domain # Add domain info + all_tutorials.append(url) + +# Sort by relevance across all domains +all_tutorials.sort(key=lambda x: x['relevance_score'], reverse=True) + +print("Top 10 Python tutorials for beginners across all sites:") +for i, tutorial in enumerate(all_tutorials[:10], 1): + print(f"{i}. 
[{tutorial['relevance_score']:.2f}] {tutorial['head_data']['title']}") + print(f" {tutorial['url']}") + print(f" From: {tutorial['domain']}") +``` + +#### Multi-Site Monitoring + +```python +# Monitor news about your company across multiple sources +news_sites = [ + "techcrunch.com", + "theverge.com", + "wired.com", + "arstechnica.com" +] + +company_name = "YourCompany" + +config = SeedingConfig( + source="cc", # Common Crawl for recent content + extract_head=True, + query=f"{company_name} announcement news", + scoring_method="bm25", + score_threshold=0.5, # High threshold for relevance + max_urls=10 +) + +results = await seeder.many_urls(news_sites, config) + +# Collect all mentions +mentions = [] +for domain, urls in results.items(): + mentions.extend(urls) + +if mentions: + print(f"Found {len(mentions)} mentions of {company_name}:") + for mention in mentions: + print(f"\n- {mention['head_data']['title']}") + print(f" {mention['url']}") + print(f" Score: {mention['relevance_score']:.2f}") +else: + print(f"No recent mentions of {company_name} found") +``` + +## Advanced Integration Patterns + +Let's put everything together in a real-world example. 
+ +### Building a Research Assistant + +Here's a complete example that discovers, scores, filters, and crawls intelligently: + +```python +import asyncio +from datetime import datetime +from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig + +class ResearchAssistant: + def __init__(self): + self.seeder = AsyncUrlSeeder() + + async def research_topic(self, topic, domains, max_articles=20): + """Research a topic across multiple domains.""" + + print(f"🔬 Researching '{topic}' across {len(domains)} domains...") + + # Step 1: Discover relevant URLs + config = SeedingConfig( + source="cc+sitemap", # Maximum coverage + extract_head=True, # Get metadata + query=topic, # Research topic + scoring_method="bm25", # Smart scoring + score_threshold=0.4, # Quality threshold + max_urls=10, # Per domain + concurrency=20, # Fast discovery + verbose=True + ) + + # Discover across all domains + discoveries = await self.seeder.many_urls(domains, config) + + # Step 2: Collect and rank all articles + all_articles = [] + for domain, urls in discoveries.items(): + for url in urls: + url['domain'] = domain + all_articles.append(url) + + # Sort by relevance + all_articles.sort(key=lambda x: x['relevance_score'], reverse=True) + + # Take top articles + top_articles = all_articles[:max_articles] + + print(f"\n📊 Found {len(all_articles)} relevant articles") + print(f"📌 Selected top {len(top_articles)} for deep analysis") + + # Step 3: Show what we're about to crawl + print("\n🎯 Articles to analyze:") + for i, article in enumerate(top_articles[:5], 1): + print(f"\n{i}. 
{article['head_data']['title']}") + print(f" Score: {article['relevance_score']:.2f}") + print(f" Source: {article['domain']}") + print(f" URL: {article['url'][:60]}...") + + # Step 4: Crawl the selected articles + print(f"\n🚀 Deep crawling {len(top_articles)} articles...") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + only_text=True, + word_count_threshold=200, # Substantial content only + stream=True + ) + + # Extract URLs and crawl all articles + article_urls = [article['url'] for article in top_articles] + results = [] + async for result in await crawler.arun_many(article_urls, config=config): + if result.success: + results.append({ + 'url': result.url, + 'title': result.metadata.get('title', 'No title'), + 'content': result.markdown.raw_markdown, + 'domain': next(a['domain'] for a in top_articles if a['url'] == result.url), + 'score': next(a['relevance_score'] for a in top_articles if a['url'] == result.url) + }) + print(f"✓ Crawled: {result.url[:60]}...") + + # Step 5: Analyze and summarize + print(f"\n📝 Analysis complete! Crawled {len(results)} articles") + + return self.create_research_summary(topic, results) + + def create_research_summary(self, topic, articles): + """Create a research summary from crawled articles.""" + + summary = { + 'topic': topic, + 'timestamp': datetime.now().isoformat(), + 'total_articles': len(articles), + 'sources': {} + } + + # Group by domain + for article in articles: + domain = article['domain'] + if domain not in summary['sources']: + summary['sources'][domain] = [] + + summary['sources'][domain].append({ + 'title': article['title'], + 'url': article['url'], + 'score': article['score'], + 'excerpt': article['content'][:500] + '...' 
if len(article['content']) > 500 else article['content'] + }) + + return summary + +# Use the research assistant +async def main(): + assistant = ResearchAssistant() + + # Research Python async programming across multiple sources + topic = "python asyncio best practices performance optimization" + domains = [ + "realpython.com", + "python.org", + "stackoverflow.com", + "medium.com" + ] + + summary = await assistant.research_topic(topic, domains, max_articles=15) + + # Display results + print("\n" + "="*60) + print("RESEARCH SUMMARY") + print("="*60) + print(f"Topic: {summary['topic']}") + print(f"Date: {summary['timestamp']}") + print(f"Total Articles Analyzed: {summary['total_articles']}") + + print("\nKey Findings by Source:") + for domain, articles in summary['sources'].items(): + print(f"\n📚 {domain} ({len(articles)} articles)") + for article in articles[:2]: # Top 2 per domain + print(f"\n Title: {article['title']}") + print(f" Relevance: {article['score']:.2f}") + print(f" Preview: {article['excerpt'][:200]}...") + +asyncio.run(main()) +``` + +### Performance Optimization Tips + +1. **Use caching wisely** +```python +# First run - populate cache +config = SeedingConfig(source="sitemap", extract_head=True, force=True) +urls = await seeder.urls("example.com", config) + +# Subsequent runs - use cache (much faster) +config = SeedingConfig(source="sitemap", extract_head=True, force=False) +urls = await seeder.urls("example.com", config) +``` + +2. **Optimize concurrency** +```python +# For many small requests (like HEAD checks) +config = SeedingConfig(concurrency=50, hits_per_sec=20) + +# For fewer large requests (like full head extraction) +config = SeedingConfig(concurrency=10, hits_per_sec=5) +``` + +3. 
**Stream large result sets** +```python +# When crawling many URLs +async with AsyncWebCrawler() as crawler: + # Assuming urls is a list of URL strings + results = await crawler.arun_many(urls, config=config) + + # Process as they arrive + async for result in results: + process_immediately(result) # Don't wait for all +``` + +## Best Practices & Tips + +### Cache Management + +The seeder automatically caches results to speed up repeated operations: + +- **Common Crawl cache**: `~/.crawl4ai/seeder_cache/[index]_[domain]_[hash].jsonl` +- **Sitemap cache**: `~/.crawl4ai/seeder_cache/sitemap_[domain]_[hash].jsonl` +- **HEAD data cache**: `~/.cache/url_seeder/head/[hash].json` + +Cache expires after 7 days by default. Use `force=True` to refresh. + +### Pattern Matching Strategies + +```python +# Be specific when possible +good_pattern = "*/blog/2024/*.html" # Specific +bad_pattern = "*" # Too broad + +# Combine patterns with metadata filtering +config = SeedingConfig( + pattern="*/articles/*", + extract_head=True +) +urls = await seeder.urls("news.com", config) + +# Further filter by publish date, author, category, etc. 
+recent = [u for u in urls if is_recent(u['head_data'])] +``` + +### Rate Limiting Considerations + +```python +# Be respectful of servers +config = SeedingConfig( + hits_per_sec=10, # Max 10 requests per second + concurrency=20 # But use 20 workers +) + +# For your own servers +config = SeedingConfig( + hits_per_sec=None, # No limit + concurrency=100 # Go fast +) +``` + +## Quick Reference + +### Common Patterns + +```python +# Blog post discovery +config = SeedingConfig( + source="sitemap", + pattern="*/blog/*", + extract_head=True, + query="your topic", + scoring_method="bm25" +) + +# E-commerce product discovery +config = SeedingConfig( + source="cc+sitemap", + pattern="*/product/*", + extract_head=True, + live_check=True +) + +# Documentation search +config = SeedingConfig( + source="sitemap", + pattern="*/docs/*", + extract_head=True, + query="API reference", + scoring_method="bm25", + score_threshold=0.5 +) + +# News monitoring +config = SeedingConfig( + source="cc", + extract_head=True, + query="company name", + scoring_method="bm25", + max_urls=50 +) +``` + +### Troubleshooting Guide + +| Issue | Solution | +|-------|----------| +| No URLs found | Try `source="cc+sitemap"`, check domain spelling | +| Slow discovery | Reduce `concurrency`, add `hits_per_sec` limit | +| Missing metadata | Ensure `extract_head=True` | +| Low relevance scores | Refine query, lower `score_threshold` | +| Rate limit errors | Reduce `hits_per_sec` and `concurrency` | + +### Performance Benchmarks + +Typical performance on a standard connection: + +- **Sitemap discovery**: 100-1,000 URLs/second +- **Common Crawl discovery**: 50-500 URLs/second +- **HEAD checking**: 10-50 URLs/second +- **Head extraction**: 5-20 URLs/second +- **BM25 scoring**: 10,000+ URLs/second + +## Conclusion + +URL seeding transforms web crawling from a blind expedition into a surgical strike. 
By discovering and analyzing URLs before crawling, you can: + +- Save hours of crawling time +- Reduce bandwidth usage by 90%+ +- Find exactly what you need +- Scale across multiple domains effortlessly + +Whether you're building a research tool, monitoring competitors, or creating a content aggregator, URL seeding gives you the intelligence to crawl smarter, not harder. + +Now go forth and seed intelligently! 🌱🚀 \ No newline at end of file diff --git a/tests/general/test_async_url_seeder_bm25.py b/tests/general/test_async_url_seeder_bm25.py new file mode 100644 index 00000000..31d6cff0 --- /dev/null +++ b/tests/general/test_async_url_seeder_bm25.py @@ -0,0 +1,711 @@ +""" +Comprehensive test cases for AsyncUrlSeeder with BM25 scoring functionality. +Tests cover all features including query-based scoring, metadata extraction, +edge cases, and integration scenarios. +""" + +import asyncio +import pytest +from typing import List, Dict, Any +from crawl4ai import AsyncUrlSeeder, SeedingConfig, AsyncLogger +import json +from datetime import datetime + +# Test domain - using docs.crawl4ai.com as it has the actual documentation +TEST_DOMAIN = "kidocode.com" +TEST_DOMAIN = "docs.crawl4ai.com" +TEST_DOMAIN = "www.bbc.com/sport" + + +class TestAsyncUrlSeederBM25: + """Comprehensive test suite for AsyncUrlSeeder with BM25 scoring.""" + + async def create_seeder(self): + """Create an AsyncUrlSeeder instance for testing.""" + logger = AsyncLogger() + return AsyncUrlSeeder(logger=logger) + + # ============================================ + # Basic BM25 Scoring Tests + # ============================================ + + @pytest.mark.asyncio + async def test_basic_bm25_scoring(self, seeder): + """Test basic BM25 scoring with a simple query.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="premier league highlights", + scoring_method="bm25", + max_urls=200, + verbose=True, + force=True # Force fresh fetch + ) + + results = await seeder.urls(TEST_DOMAIN, 
config) + + # Verify results have relevance scores + assert all("relevance_score" in r for r in results) + + # Verify scores are normalized between 0 and 1 + scores = [r["relevance_score"] for r in results] + assert all(0.0 <= s <= 1.0 for s in scores) + + # Verify results are sorted by relevance (descending) + assert scores == sorted(scores, reverse=True) + + # Print top 5 results for manual verification + print("\nTop 5 results for 'web crawling tutorial':") + for i, r in enumerate(results[:5]): + print(f"{i+1}. Score: {r['relevance_score']:.3f} - {r['url']}") + + @pytest.mark.asyncio + async def test_query_variations(self, seeder): + """Test BM25 scoring with different query variations.""" + queries = [ + "VAR controversy", + "player ratings", + "live score update", + "transfer rumours", + "post match analysis", + "injury news" + ] + + for query in queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + max_urls=100, + # force=True + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Verify each query produces scored results + assert len(results) > 0 + assert all("relevance_score" in r for r in results) + + print(f"\nTop result for '{query}':") + if results: + top = results[0] + print(f" Score: {top['relevance_score']:.3f} - {top['url']}") + + # ============================================ + # Score Threshold Tests + # ============================================ + + @pytest.mark.asyncio + async def test_score_threshold_filtering(self, seeder): + """Test filtering results by minimum relevance score.""" + thresholds = [0.1, 0.3, 0.5, 0.7] + + for threshold in thresholds: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="league standings", + score_threshold=threshold, + scoring_method="bm25", + max_urls=50 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Verify all results meet threshold + if results: + assert all(r["relevance_score"] >= threshold for r 
in results) + + print(f"\nThreshold {threshold}: {len(results)} URLs passed") + + @pytest.mark.asyncio + async def test_extreme_thresholds(self, seeder): + """Test edge cases with extreme threshold values.""" + # Very low threshold - should return many results + config_low = SeedingConfig( + source="sitemap", + extract_head=True, + query="match", + score_threshold=0.001, + scoring_method="bm25" + ) + results_low = await seeder.urls(TEST_DOMAIN, config_low) + + # Very high threshold - might return few or no results + config_high = SeedingConfig( + source="sitemap", + extract_head=True, + query="match", + score_threshold=0.99, + scoring_method="bm25" + ) + results_high = await seeder.urls(TEST_DOMAIN, config_high) + + # Low threshold should return more results than high + assert len(results_low) >= len(results_high) + print(f"\nLow threshold (0.001): {len(results_low)} results") + print(f"High threshold (0.99): {len(results_high)} results") + + # ============================================ + # Metadata Extraction Tests + # ============================================ + + @pytest.mark.asyncio + async def test_comprehensive_metadata_extraction(self, seeder): + """Test extraction of all metadata types including JSON-LD.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="match report", + scoring_method="bm25", + max_urls=5, + verbose=True + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + for result in results: + head_data = result.get("head_data", {}) + + # Check for various metadata fields + print(f"\nMetadata for {result['url']}:") + print(f" Title: {head_data.get('title', 'N/A')}") + print(f" Charset: {head_data.get('charset', 'N/A')}") + print(f" Lang: {head_data.get('lang', 'N/A')}") + + # Check meta tags + meta = head_data.get("meta", {}) + if meta: + print(" Meta tags found:") + for key in ["description", "keywords", "author", "viewport"]: + if key in meta: + print(f" {key}: {meta[key][:50]}...") + + # Check for Open Graph 
tags + og_tags = {k: v for k, v in meta.items() if k.startswith("og:")} + if og_tags: + print(" Open Graph tags found:") + for k, v in list(og_tags.items())[:3]: + print(f" {k}: {v[:50]}...") + + # Check JSON-LD + if head_data.get("jsonld"): + print(f" JSON-LD schemas found: {len(head_data['jsonld'])}") + + @pytest.mark.asyncio + async def test_jsonld_extraction_scoring(self, seeder): + """Test that JSON-LD data contributes to BM25 scoring.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="Premier League match report highlights", + scoring_method="bm25", + max_urls=20 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Find results with JSON-LD data + jsonld_results = [r for r in results if r.get("head_data", {}).get("jsonld")] + + if jsonld_results: + print(f"\nFound {len(jsonld_results)} URLs with JSON-LD data") + for r in jsonld_results[:3]: + print(f" Score: {r['relevance_score']:.3f} - {r['url']}") + jsonld_data = r["head_data"]["jsonld"] + print(f" JSON-LD types: {[item.get('@type', 'Unknown') for item in jsonld_data if isinstance(item, dict)]}") + + # ============================================ + # Edge Cases and Error Handling + # ============================================ + + @pytest.mark.asyncio + async def test_empty_query(self, seeder): + """Test behavior with empty query string.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="", + scoring_method="bm25", + max_urls=10 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Should return results but all with zero scores + assert len(results) > 0 + assert all(r.get("relevance_score", 0) == 0 for r in results) + + @pytest.mark.asyncio + async def test_query_without_extract_head(self, seeder): + """Test query scoring when extract_head is False.""" + config = SeedingConfig( + source="sitemap", + extract_head=False, # This should trigger a warning + query="Premier League match report highlights", + scoring_method="bm25", + 
max_urls=10 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Results should not have relevance scores + assert all("relevance_score" not in r for r in results) + print("\nVerified: No scores added when extract_head=False") + + @pytest.mark.asyncio + async def test_special_characters_in_query(self, seeder): + """Test queries with special characters and symbols.""" + special_queries = [ + "premier league + analytics", + "injury/rehab routines", + "AI-powered scouting", + "match stats & xG", + "tactical@breakdown", + "transfer-window.yml" + ] + + for query in special_queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + max_urls=5 + ) + + try: + results = await seeder.urls(TEST_DOMAIN, config) + assert isinstance(results, list) + print(f"\n✓ Query '{query}' processed successfully") + except Exception as e: + pytest.fail(f"Failed on query '{query}': {str(e)}") + + @pytest.mark.asyncio + async def test_unicode_query(self, seeder): + """Test queries with Unicode characters.""" + unicode_queries = [ + "网页爬虫", # Chinese + "веб-краулер", # Russian + "🚀 crawl4ai", # Emoji + "naïve implementation", # Accented characters + ] + + for query in unicode_queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + max_urls=5 + ) + + try: + results = await seeder.urls(TEST_DOMAIN, config) + assert isinstance(results, list) + print(f"\n✓ Unicode query '{query}' processed successfully") + except Exception as e: + print(f"\n✗ Unicode query '{query}' failed: {str(e)}") + + # ============================================ + # Performance and Scalability Tests + # ============================================ + + @pytest.mark.asyncio + async def test_large_scale_scoring(self, seeder): + """Test BM25 scoring with many URLs.""" + config = SeedingConfig( + source="cc+sitemap", # Use both sources for more URLs + extract_head=True, + query="world cup group 
standings", + scoring_method="bm25", + max_urls=100, + concurrency=20, + hits_per_sec=10 + ) + + start_time = asyncio.get_event_loop().time() + results = await seeder.urls(TEST_DOMAIN, config) + elapsed = asyncio.get_event_loop().time() - start_time + + print(f"\nProcessed {len(results)} URLs in {elapsed:.2f} seconds") + print(f"Average time per URL: {elapsed/len(results)*1000:.1f}ms") + + # Verify scoring worked at scale + assert all("relevance_score" in r for r in results) + + # Check score distribution + scores = [r["relevance_score"] for r in results] + print(f"Score distribution:") + print(f" Min: {min(scores):.3f}") + print(f" Max: {max(scores):.3f}") + print(f" Avg: {sum(scores)/len(scores):.3f}") + + @pytest.mark.asyncio + async def test_concurrent_scoring_consistency(self, seeder): + """Test that concurrent requests produce consistent scores.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="live score update", + scoring_method="bm25", + max_urls=20, + concurrency=10 + ) + + # Run the same query multiple times + results_list = [] + for _ in range(3): + results = await seeder.urls(TEST_DOMAIN, config) + results_list.append(results) + + # Compare scores across runs (they should be identical for same URLs) + url_scores = {} + for results in results_list: + for r in results: + url = r["url"] + score = r["relevance_score"] + if url in url_scores: + # Scores should be very close (allowing for tiny float differences) + assert abs(url_scores[url] - score) < 0.001 + else: + url_scores[url] = score + + print(f"\n✓ Consistent scores across {len(results_list)} runs") + + # ============================================ + # Multi-Domain Tests + # ============================================ + + @pytest.mark.asyncio + async def test_many_urls_with_scoring(self, seeder): + """Test many_urls method with BM25 scoring.""" + domains = [TEST_DOMAIN, "docs.crawl4ai.com", "example.com"] + + config = SeedingConfig( + source="sitemap", + 
extract_head=True, + # live_check=True, + query="fixture list", + scoring_method="bm25", + score_threshold=0.2, + max_urls=10, + force=True, # Force fresh fetch + ) + + results_dict = await seeder.many_urls(domains, config) + + for domain, results in results_dict.items(): + print(f"\nDomain: {domain}") + print(f" Found {len(results)} URLs above threshold") + if results: + top = results[0] + print(f" Top result: {top['relevance_score']:.3f} - {top['url']}") + + # ============================================ + # Complex Query Tests + # ============================================ + + @pytest.mark.asyncio + async def test_multi_word_complex_queries(self, seeder): + """Test complex multi-word queries.""" + complex_queries = [ + "how to follow live match commentary", + "extract expected goals stats from match data", + "premier league match report analysis", + "transfer rumours and confirmed signings tracker", + "tactical breakdown of high press strategy" + ] + + for query in complex_queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + max_urls=5 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + if results: + print(f"\nQuery: '{query}'") + print(f"Top match: {results[0]['relevance_score']:.3f} - {results[0]['url']}") + + # Extract matched terms from metadata + head_data = results[0].get("head_data", {}) + title = head_data.get("title", "") + description = head_data.get("meta", {}).get("description", "") + + # Simple term matching for verification + query_terms = set(query.lower().split()) + title_terms = set(title.lower().split()) + desc_terms = set(description.lower().split()) + + matched_terms = query_terms & (title_terms | desc_terms) + if matched_terms: + print(f"Matched terms: {', '.join(matched_terms)}") + + # ============================================ + # Cache and Force Tests + # ============================================ + + @pytest.mark.asyncio + async def 
test_scoring_with_cache(self, seeder): + """Test that scoring works correctly with cached results.""" + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="injury update timeline", + scoring_method="bm25", + max_urls=10, + force=False # Use cache + ) + + # First run - populate cache + results1 = await seeder.urls(TEST_DOMAIN, config) + + # Second run - should use cache + results2 = await seeder.urls(TEST_DOMAIN, config) + + # Results should be identical + assert len(results1) == len(results2) + for r1, r2 in zip(results1, results2): + assert r1["url"] == r2["url"] + assert abs(r1["relevance_score"] - r2["relevance_score"]) < 0.001 + + print("\n✓ Cache produces consistent scores") + + @pytest.mark.asyncio + async def test_force_refresh_scoring(self, seeder): + """Test force=True bypasses cache for fresh scoring.""" + config_cached = SeedingConfig( + source="sitemap", + extract_head=True, + query="transfer window", + scoring_method="bm25", + max_urls=5, + force=False + ) + + config_forced = SeedingConfig( + source="sitemap", + extract_head=True, + query="transfer window", + scoring_method="bm25", + max_urls=5, + force=True + ) + + # Run with cache + start1 = asyncio.get_event_loop().time() + results1 = await seeder.urls(TEST_DOMAIN, config_cached) + time1 = asyncio.get_event_loop().time() - start1 + + # Run with force (should be slower due to fresh fetch) + start2 = asyncio.get_event_loop().time() + results2 = await seeder.urls(TEST_DOMAIN, config_forced) + time2 = asyncio.get_event_loop().time() - start2 + + print(f"\nCached run: {time1:.2f}s") + print(f"Forced run: {time2:.2f}s") + + # Both should produce scored results + assert all("relevance_score" in r for r in results1) + assert all("relevance_score" in r for r in results2) + + # ============================================ + # Source Combination Tests + # ============================================ + + @pytest.mark.asyncio + async def test_scoring_with_multiple_sources(self, seeder): + 
"""Test BM25 scoring with combined sources (cc+sitemap).""" + config = SeedingConfig( + source="cc+sitemap", + extract_head=True, + query="match highlights video", + scoring_method="bm25", + score_threshold=0.3, + max_urls=30, + concurrency=15 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + # Verify we got results from both sources + print(f"\nCombined sources returned {len(results)} URLs above threshold") + + # Check URL diversity + unique_paths = set() + for r in results: + path = r["url"].replace("https://", "").replace("http://", "").split("/", 1)[-1] + unique_paths.add(path.split("?")[0]) # Remove query params + + print(f"Unique paths found: {len(unique_paths)}") + + # All should be scored and above threshold + assert all(r["relevance_score"] >= 0.3 for r in results) + + # ============================================ + # Integration Tests + # ============================================ + + @pytest.mark.asyncio + async def test_full_workflow_integration(self, seeder): + """Test complete workflow: discover -> score -> filter -> use.""" + # Step 1: Discover and score URLs + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="premier league opening fixtures", + scoring_method="bm25", + score_threshold=0.4, + max_urls=10, + verbose=True + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + print(f"\nStep 1: Found {len(results)} relevant URLs") + + # Step 2: Analyze top results + if results: + top_urls = results[:3] + print("\nStep 2: Top 3 URLs for crawling:") + for i, r in enumerate(top_urls): + print(f"{i+1}. 
Score: {r['relevance_score']:.3f}") + print(f" URL: {r['url']}") + print(f" Title: {r['head_data'].get('title', 'N/A')}") + + # Check metadata quality + meta = r['head_data'].get('meta', {}) + if 'description' in meta: + print(f" Description: {meta['description'][:80]}...") + + # Step 3: Verify these URLs would be good for actual crawling + assert all(r["status"] == "valid" for r in results[:3]) + print("\nStep 3: All top URLs are valid for crawling ✓") + + # ============================================ + # Report Generation + # ============================================ + + @pytest.mark.asyncio + async def test_generate_scoring_report(self, seeder): + """Generate a comprehensive report of BM25 scoring effectiveness.""" + queries = { + "beginner": "match schedule", + "advanced": "tactical analysis pressing", + "api": "VAR decision explanation", + "deployment": "fixture changes due to weather", + "extraction": "expected goals statistics" + } + + report = { + "timestamp": datetime.now().isoformat(), + "domain": TEST_DOMAIN, + "results": {} + } + + for category, query in queries.items(): + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + max_urls=10 + ) + + results = await seeder.urls(TEST_DOMAIN, config) + + report["results"][category] = { + "query": query, + "total_results": len(results), + "top_results": [ + { + "url": r["url"], + "score": r["relevance_score"], + "title": r["head_data"].get("title", "") + } + for r in results[:3] + ], + "score_distribution": { + "min": min(r["relevance_score"] for r in results) if results else 0, + "max": max(r["relevance_score"] for r in results) if results else 0, + "avg": sum(r["relevance_score"] for r in results) / len(results) if results else 0 + } + } + + # Print report + print("\n" + "="*60) + print("BM25 SCORING EFFECTIVENESS REPORT") + print("="*60) + print(f"Domain: {report['domain']}") + print(f"Timestamp: {report['timestamp']}") + print("\nResults by 
Category:") + + for category, data in report["results"].items(): + print(f"\n{category.upper()}: '{data['query']}'") + print(f" Total results: {data['total_results']}") + print(f" Score range: {data['score_distribution']['min']:.3f} - {data['score_distribution']['max']:.3f}") + print(f" Average score: {data['score_distribution']['avg']:.3f}") + print(" Top matches:") + for i, result in enumerate(data['top_results']): + print(f" {i+1}. [{result['score']:.3f}] {result['title']}") + + +# ============================================ +# Standalone test runner +# ============================================ + +async def run_all_tests(): + """Run all tests standalone (without pytest).""" + print("Running AsyncUrlSeeder BM25 Tests...") + print("="*60) + + test_instance = TestAsyncUrlSeederBM25() + seeder = await test_instance.create_seeder() + + # Run each test method + test_methods = [ + # test_instance.test_basic_bm25_scoring, + # test_instance.test_query_variations, + # test_instance.test_score_threshold_filtering, + # test_instance.test_extreme_thresholds, + # test_instance.test_comprehensive_metadata_extraction, + # test_instance.test_jsonld_extraction_scoring, + # test_instance.test_empty_query, + # test_instance.test_query_without_extract_head, + # test_instance.test_special_characters_in_query, + # test_instance.test_unicode_query, + # test_instance.test_large_scale_scoring, + # test_instance.test_concurrent_scoring_consistency, + # test_instance.test_many_urls_with_scoring, + test_instance.test_multi_word_complex_queries, + test_instance.test_scoring_with_cache, + test_instance.test_force_refresh_scoring, + test_instance.test_scoring_with_multiple_sources, + test_instance.test_full_workflow_integration, + test_instance.test_generate_scoring_report + ] + + for test_method in test_methods: + try: + print(f"\nRunning {test_method.__name__}...") + await test_method(seeder) + print(f"✓ {test_method.__name__} passed") + except Exception as e: + import traceback + 
print(f"✗ {test_method.__name__} failed: {str(e)}") + print(f" Error type: {type(e).__name__}") + traceback.print_exc() + + print("\n" + "="*60) + print("Test suite completed!") + + +if __name__ == "__main__": + # Run tests directly + asyncio.run(run_all_tests()) \ No newline at end of file