diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index b9dce91a..c3931b07 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -14,7 +14,16 @@ Features
 """
 
 from __future__ import annotations
-import aiofiles, asyncio, gzip, hashlib, io, json, os, pathlib, re, time
+import aiofiles
+import asyncio
+import gzip
+import hashlib
+import io
+import json
+import os
+import pathlib
+import re
+import time
 from datetime import timedelta
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -42,7 +51,8 @@ except ImportError:
 # Import AsyncLoggerBase from crawl4ai's logger module
 # Assuming crawl4ai/async_logger.py defines AsyncLoggerBase
 # You might need to adjust this import based on your exact file structure
-from .async_logger import AsyncLoggerBase, AsyncLogger # Import AsyncLogger for default if needed
+# Import AsyncLogger for default if needed
+from .async_logger import AsyncLoggerBase, AsyncLogger
 
 # Import SeedingConfig for type hints
 from typing import TYPE_CHECKING
@@ -55,16 +65,19 @@ COLLINFO_URL = "https://index.commoncrawl.org/collinfo.json"
 # CACHE_DIR = pathlib.Path("~/.crawl4ai").expanduser() # REMOVED: now managed by __init__
 # CACHE_DIR.mkdir(exist_ok=True) # REMOVED: now managed by __init__
 # INDEX_CACHE = CACHE_DIR / "latest_cc_index.txt" # REMOVED: now managed by __init__
-TTL = timedelta(days=7) # Keeping this constant as it's a seeder-specific TTL
+TTL = timedelta(days=7)  # Keeping this constant as it's a seeder-specific TTL
 
 _meta_rx = re.compile(
     r'<meta\s+(?:[^>]*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>',
     re.I)
 _charset_rx = re.compile(r'<meta\s+[^>]*charset=["\']?([^"\' >]+)', re.I)
-_title_rx   = re.compile(r'<title>(.*?)</title>', re.I|re.S)
-_link_rx    = re.compile(r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
+_title_rx = re.compile(r'<title>(.*?)</title>', re.I | re.S)
+_link_rx = re.compile(
+    r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
 
 # ────────────────────────────────────────────────────────────────────────── helpers
+
+
 def _match(url: str, pattern: str) -> bool:
     if fnmatch.fnmatch(url, pattern):
         return True
@@ -72,11 +85,13 @@ def _match(url: str, pattern: str) -> bool:
     return (fnmatch.fnmatch(canon, pattern)
             or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern)))
 
+
 def _parse_head(src: str) -> Dict[str, Any]:
     if LXML:
         try:
             if isinstance(src, str):
-                src = src.encode("utf-8", "replace")     # strip Unicode, let lxml decode
+                # strip Unicode, let lxml decode
+                src = src.encode("utf-8", "replace")
             doc = lxml_html.fromstring(src)
         except (ValueError, etree.ParserError):
             return {}        # malformed, bail gracefully
@@ -87,13 +102,18 @@ def _parse_head(src: str) -> Dict[str, Any]:
             "meta": {}, "link": {}, "jsonld": []
         }
         for el in doc.xpath(".//meta"):
-            k = el.attrib.get("name") or el.attrib.get("property") or el.attrib.get("http-equiv")
-            if k: info["meta"][k.lower()] = el.attrib.get("content", "")
-            elif "charset" in el.attrib: info["charset"] = el.attrib["charset"].lower()
+            k = el.attrib.get("name") or el.attrib.get(
+                "property") or el.attrib.get("http-equiv")
+            if k:
+                info["meta"][k.lower()] = el.attrib.get("content", "")
+            elif "charset" in el.attrib:
+                info["charset"] = el.attrib["charset"].lower()
         for el in doc.xpath(".//link"):
             rel = " ".join(el.attrib.get("rel", [])).lower()
-            if not rel: continue
-            entry = {a: el.attrib[a] for a in ("href","as","type","hreflang") if a in el.attrib}
+            if not rel:
+                continue
+            entry = {a: el.attrib[a] for a in (
+                "href", "as", "type", "hreflang") if a in el.attrib}
             info["link"].setdefault(rel, []).append(entry)
         # Extract JSON-LD structured data
         for script in doc.xpath('.//script[@type="application/ld+json"]'):
@@ -109,14 +129,19 @@ def _parse_head(src: str) -> Dict[str, Any]:
             info["lang"] = html_elem.attrib.get("lang", "")
         return info
     # regex fallback
-    info: Dict[str,Any] = {"title":None,"charset":None,"meta":{},"link":{},"jsonld":[],"lang":""}
-    m=_title_rx.search(src);            info["title"]=m.group(1).strip() if m else None
-    for k,v in _meta_rx.findall(src):   info["meta"][k.lower()]=v
-    m=_charset_rx.search(src);          info["charset"]=m.group(1).lower() if m else None
-    for rel,href in _link_rx.findall(src):
-        info["link"].setdefault(rel.lower(),[]).append({"href":href})
+    info: Dict[str, Any] = {"title": None, "charset": None,
+                            "meta": {}, "link": {}, "jsonld": [], "lang": ""}
+    m = _title_rx.search(src)
+    info["title"] = m.group(1).strip() if m else None
+    for k, v in _meta_rx.findall(src):
+        info["meta"][k.lower()] = v
+    m = _charset_rx.search(src)
+    info["charset"] = m.group(1).lower() if m else None
+    for rel, href in _link_rx.findall(src):
+        info["link"].setdefault(rel.lower(), []).append({"href": href})
     # Try to extract JSON-LD with regex
-    jsonld_pattern = re.compile(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I|re.S)
+    jsonld_pattern = re.compile(
+        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I | re.S)
     for match in jsonld_pattern.findall(src):
         try:
             jsonld_data = json.loads(match.strip())
@@ -130,41 +155,72 @@ def _parse_head(src: str) -> Dict[str, Any]:
     return info
 
 # ────────────────────────────────────────────────────────────────────────── class
+
+
 class AsyncUrlSeeder:
     """
     Async version of UrlSeeder.
     Call pattern is await/async for / async with.
 
-    Public coroutine
-    ----------------
+    Public coroutines
+    -----------------
     await seed.urls(...)
         returns List[Dict[str,Any]]  (url, status, head_data)
+    await seed.many_urls(...)
+        returns Dict[str, List[Dict[str,Any]]]
+    await seed.close()
+        closes the HTTP client if owned by seeder
+    
+    Usage examples
+    --------------
+    # Manual cleanup:
+    seeder = AsyncUrlSeeder()
+    try:
+        urls = await seeder.urls("example.com", config)
+    finally:
+        await seeder.close()
+    
+    # Using async context manager (recommended):
+    async with AsyncUrlSeeder() as seeder:
+        urls = await seeder.urls("example.com", config)
+    
+    # Reusing existing client:
+    client = httpx.AsyncClient()
+    seeder = AsyncUrlSeeder(client=client)
+    urls = await seeder.urls("example.com", config)
+    # No need to close seeder, as it doesn't own the client
     """
 
     def __init__(
         self,
         ttl: timedelta = TTL,
-        client: Optional[httpx.AsyncClient]=None,
-        logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter
-        base_directory: Optional[Union[str, pathlib.Path]] = None, # NEW: Add base_directory
+        client: Optional[httpx.AsyncClient] = None,
+        logger: Optional[AsyncLoggerBase] = None,  # NEW: Add logger parameter
+        # NEW: Add base_directory
+        base_directory: Optional[Union[str, pathlib.Path]] = None,
         cache_root: Optional[Union[str, Path]] = None,
     ):
         self.ttl = ttl
+        self._owns_client = client is None  # Track if we created the client
         self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={
-            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
         })
-        self.logger = logger # Store the logger instance
-        self.base_directory = pathlib.Path(base_directory or os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory
-        self.cache_dir = self.base_directory / ".crawl4ai" / "seeder_cache" # NEW: Specific cache dir for seeder
-        self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists
-        self.index_cache_path = self.cache_dir / "latest_cc_index.txt" # NEW: Index cache path
+        self.logger = logger  # Store the logger instance
+        self.base_directory = pathlib.Path(base_directory or os.getenv(
+            "CRAWL4_AI_BASE_DIRECTORY", Path.home()))  # Resolve base_directory
+        self.cache_dir = self.base_directory / ".crawl4ai" / \
+            "seeder_cache"  # NEW: Specific cache dir for seeder
+        self.cache_dir.mkdir(parents=True, exist_ok=True)  # Ensure it exists
+        self.index_cache_path = self.cache_dir / \
+            "latest_cc_index.txt"  # NEW: Index cache path
 
         # defer – grabbing the index inside an active loop blows up
         self.index_id: Optional[str] = None
         self._rate_sem: Optional[asyncio.Semaphore] = None
 
         # ───────── cache dirs ─────────
-        self.cache_root = Path(os.path.expanduser(cache_root or "~/.cache/url_seeder"))
+        self.cache_root = Path(os.path.expanduser(
+            cache_root or "~/.cache/url_seeder"))
         (self.cache_root / "live").mkdir(parents=True, exist_ok=True)
         (self.cache_root / "head").mkdir(exist_ok=True)
 
@@ -173,7 +229,8 @@ class AsyncUrlSeeder:
         if self.logger:
             log_method = getattr(self.logger, level, None)
             if log_method:
-                log_method(message=message, tag=tag, params=kwargs.get('params', {}))
+                log_method(message=message, tag=tag,
+                           params=kwargs.get('params', {}))
             # else: # Fallback for unknown level, should not happen with AsyncLoggerBase
             #     print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}")
 
@@ -182,35 +239,34 @@ class AsyncUrlSeeder:
         h = hashlib.sha1(url.encode()).hexdigest()
         return self.cache_root / kind / f"{h}.json"
 
-    def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
+    async def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
         p = self._cache_path(kind, url)
         if not p.exists():
             return None
-        # TTL check
-        if time.time() - p.stat().st_mtime > self.ttl.total_seconds():
+        if time.time()-p.stat().st_mtime > self.ttl.total_seconds():
             return None
         try:
-            return json.loads(p.read_text())
+            async with aiofiles.open(p, "r") as f:
+                return json.loads(await f.read())
         except Exception:
             return None
 
-    def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
+    async def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
         try:
-            self._cache_path(kind, url).write_text(
-                json.dumps(data, separators=(",", ":"))
-            )
+            async with aiofiles.open(self._cache_path(kind, url), "w") as f:
+                await f.write(json.dumps(data, separators=(",", ":")))
         except Exception:
             pass
 
-
     # ─────────────────────────────── discovery entry
+
     async def urls(self,
-        domain: str,
-        config: "SeedingConfig",
-    ) -> List[Dict[str,Any]]:
+                   domain: str,
+                   config: "SeedingConfig",
+                   ) -> List[Dict[str, Any]]:
         """
         Fetch URLs for a domain using configuration from SeedingConfig.
-        
+
         Parameters
         ----------
         domain : str
@@ -228,37 +284,40 @@ class AsyncUrlSeeder:
         hits_per_sec = config.hits_per_sec
         self.force = config.force  # Store force flag as instance attribute
         force = config.force
-        verbose = config.verbose if config.verbose is not None else (self.logger.verbose if self.logger else False)
+        verbose = config.verbose if config.verbose is not None else (
+            self.logger.verbose if self.logger else False)
         max_urls = config.max_urls if config.max_urls is not None else -1
         query = config.query
         score_threshold = config.score_threshold
-        scoring_method = config.scoring_method    
-        
+        scoring_method = config.scoring_method
+
         # Ensure seeder's logger verbose matches the config's verbose if it's set
         if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
             self.logger.verbose = config.verbose
 
         # ensure we have the latest CC collection id
         if self.index_id is None:
-            self.index_id = await self._latest_index()        
+            self.index_id = await self._latest_index()
 
         # Parse source parameter - split by '+' to get list of sources
         sources = source.split('+')
         valid_sources = {"cc", "sitemap"}
         for s in sources:
             if s not in valid_sources:
-                raise ValueError(f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
-        
+                raise ValueError(
+                    f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
+
         if hits_per_sec:
             if hits_per_sec <= 0:
-                self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
+                self._log(
+                    "warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
                 self._rate_sem = None
             else:
                 self._rate_sem = asyncio.Semaphore(hits_per_sec)
         else:
-            self._rate_sem = None # Ensure it's None if no rate limiting
+            self._rate_sem = None  # Ensure it's None if no rate limiting
 
-        self._log("info", "Starting URL seeding for {domain} with source={source}", 
+        self._log("info", "Starting URL seeding for {domain} with source={source}",
                   params={"domain": domain, "source": source}, tag="URL_SEED")
 
         # choose stream
@@ -268,44 +327,51 @@ class AsyncUrlSeeder:
                 async for u in self._from_sitemaps(domain, pattern, force):
                     yield u
             if "cc" in sources:
-                self._log("debug", "Fetching from Common Crawl...", tag="URL_SEED")
+                self._log("debug", "Fetching from Common Crawl...",
+                          tag="URL_SEED")
                 async for u in self._from_cc(domain, pattern, force):
                     yield u
 
-        queue = asyncio.Queue()
+        # Use bounded queue to prevent RAM spikes with large domains
+        queue_size = min(10000, max(1000, concurrency * 100))  # Dynamic size based on concurrency
+        queue = asyncio.Queue(maxsize=queue_size)
         producer_done = asyncio.Event()
-        stop_event    = asyncio.Event()
+        stop_event = asyncio.Event()
         seen: set[str] = set()
 
         async def producer():
             try:
                 async for u in gen():
                     if u in seen:
-                        self._log("debug", "Skipping duplicate URL: {url}", 
+                        self._log("debug", "Skipping duplicate URL: {url}",
                                   params={"url": u}, tag="URL_SEED")
                         continue
                     if stop_event.is_set():
-                        self._log("info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
+                        self._log(
+                            "info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
                         break
-                    await queue.put(u)
+                    seen.add(u)
+                    await queue.put(u)  # Will block if queue is full, providing backpressure
             except Exception as e:
-                self._log("error", "Producer encountered an error: {error}", params={"error": str(e)}, tag="URL_SEED")
+                self._log("error", "Producer encountered an error: {error}", params={
+                          "error": str(e)}, tag="URL_SEED")
             finally:
                 producer_done.set()
                 self._log("debug", "Producer finished.", tag="URL_SEED")
 
-
-        async def worker(res_list: List[Dict[str,Any]]):
+        async def worker(res_list: List[Dict[str, Any]]):
             while True:
                 if queue.empty() and producer_done.is_set():
                     # self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED")
                     break
                 try:
-                    url = await asyncio.wait_for(queue.get(), 5) # Increased timeout slightly
+                    # Increased timeout slightly
+                    url = await asyncio.wait_for(queue.get(), 5)
                 except asyncio.TimeoutError:
-                    continue # Keep checking queue and producer_done status
+                    continue  # Keep checking queue and producer_done status
                 except Exception as e:
-                    self._log("error", "Worker failed to get URL from queue: {error}", params={"error": str(e)}, tag="URL_SEED")
+                    self._log("error", "Worker failed to get URL from queue: {error}", params={
+                              "error": str(e)}, tag="URL_SEED")
                     continue
 
                 if max_urls > 0 and len(res_list) >= max_urls:
@@ -332,70 +398,34 @@ class AsyncUrlSeeder:
                 if self._rate_sem:  # global QPS control
                     async with self._rate_sem:
                         await self._validate(url, res_list, live_check, extract_head,
-                                             head_timeout, verbose)
+                                             head_timeout, verbose, query, score_threshold, scoring_method)
                 else:
                     await self._validate(url, res_list, live_check, extract_head,
-                                         head_timeout, verbose)
-                queue.task_done() # Mark task as done for queue.join() if ever used
+                                         head_timeout, verbose, query, score_threshold, scoring_method)
+                queue.task_done()  # Mark task as done for queue.join() if ever used
 
         # launch
-        results: List[Dict[str,Any]] = []
+        results: List[Dict[str, Any]] = []
         prod_task = asyncio.create_task(producer())
-        workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)]
+        workers = [asyncio.create_task(worker(results))
+                   for _ in range(concurrency)]
 
         # Wait for all workers to finish
         await asyncio.gather(prod_task, *workers)
-        await queue.join() # Ensure all queued items are processed
+        await queue.join()  # Ensure all queued items are processed
 
-        self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}", 
+        self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
                   params={"domain": domain, "count": len(results)}, tag="URL_SEED")
 
-        # Apply BM25 scoring if query is provided and extract_head is enabled
+        # Sort by relevance score if query was provided
         if query and extract_head and scoring_method == "bm25":
-            self._log("info", "Applying BM25 scoring for query: '{query}'", 
-                      params={"query": query}, tag="URL_SEED")
-            
-            # Extract text contexts from all results
-            documents = []
-            valid_indices = []
-            for i, result in enumerate(results):
-                if result.get("head_data"):
-                    text_context = self._extract_text_context(result["head_data"])
-                    if text_context:  # Only include non-empty contexts
-                        documents.append(text_context)
-                        valid_indices.append(i)
-            
-            if documents:
-                # Calculate BM25 scores
-                scores = self._calculate_bm25_score(query, documents)
-                
-                # Add scores to results
-                for idx, score in zip(valid_indices, scores):
-                    results[idx]["relevance_score"] = float(score)
-                
-                # Add zero scores to results without head_data
-                for i, result in enumerate(results):
-                    if i not in valid_indices:
-                        result["relevance_score"] = 0.0
-                
-                # Filter by score threshold if specified
-                if score_threshold is not None:
-                    original_count = len(results)
-                    results = [r for r in results if r.get("relevance_score", 0.0) >= score_threshold]
-                    self._log("info", "Filtered {filtered} URLs below score threshold {threshold}. Remaining: {remaining}", 
-                              params={"filtered": original_count - len(results), 
-                                      "threshold": score_threshold, 
-                                      "remaining": len(results)}, tag="URL_SEED")
-                
-                # Sort by relevance score (highest first)
-                results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
-            else:
-                self._log("warning", "No valid head data found for BM25 scoring.", tag="URL_SEED")
-                # Add zero scores to all results
-                for result in results:
-                    result["relevance_score"] = 0.0
+            results.sort(key=lambda x: x.get(
+                "relevance_score", 0.0), reverse=True)
+            self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
+                      params={"count": len(results), "query": query}, tag="URL_SEED")
         elif query and not extract_head:
-            self._log("warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
+            self._log(
+                "warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
 
         return results[:max_urls] if max_urls > 0 else results
 
@@ -403,7 +433,7 @@ class AsyncUrlSeeder:
         self,
         domains: Sequence[str],
         config: "SeedingConfig",
-    ) -> Dict[str, List[Dict[str,Any]]]:
+    ) -> Dict[str, List[Dict[str, Any]]]:
         """
         Fetch URLs for many domains in parallel.
 
@@ -416,9 +446,9 @@ class AsyncUrlSeeder:
 
         Returns a {domain: urls-list} dict.
         """
-        self._log("info", "Starting URL seeding for {count} domains...", 
+        self._log("info", "Starting URL seeding for {count} domains...",
                   params={"count": len(domains)}, tag="URL_SEED")
-        
+
         # Ensure seeder's logger verbose matches the config's verbose if it's set
         if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
             self.logger.verbose = config.verbose
@@ -428,9 +458,10 @@ class AsyncUrlSeeder:
             for domain in domains
         ]
         results = await asyncio.gather(*tasks)
-        
+
         final_results = dict(zip(domains, results))
-        self._log("info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
+        self._log(
+            "info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
         return final_results
 
     async def _resolve_head(self, url: str) -> Optional[str]:
@@ -459,66 +490,67 @@ class AsyncUrlSeeder:
 
         except Exception as e:
             self._log("debug", "HEAD {url} failed: {err}",
-                    params={"url": url, "err": str(e)}, tag="URL_SEED")
+                      params={"url": url, "err": str(e)}, tag="URL_SEED")
             return None
 
-
     # ─────────────────────────────── CC
-    async def _from_cc(self, domain:str, pattern:str, force:bool):
+    async def _from_cc(self, domain: str, pattern: str, force: bool):
         import re
         digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
 
         # ── normalise for CC   (strip scheme, query, fragment)
-        raw = re.sub(r'^https?://', '', domain).split('#', 1)[0].split('?', 1)[0].lstrip('.')
+        raw = re.sub(r'^https?://', '', domain).split('#',
+                                                      1)[0].split('?', 1)[0].lstrip('.')
 
         # ── sanitize only for cache-file name
         safe = re.sub('[/?#]+', '_', raw)
         path = self.cache_dir / f"{self.index_id}_{safe}_{digest}.jsonl"
 
         if path.exists() and not force:
-            self._log("info", "Loading CC URLs for {domain} from cache: {path}", 
+            self._log("info", "Loading CC URLs for {domain} from cache: {path}",
                       params={"domain": domain, "path": path}, tag="URL_SEED")
-            async with aiofiles.open(path,"r") as fp:
+            async with aiofiles.open(path, "r") as fp:
                 async for line in fp:
-                    url=line.strip()
-                    if _match(url,pattern): yield url
+                    url = line.strip()
+                    if _match(url, pattern):
+                        yield url
             return
 
         # build CC glob – if a path is present keep it, else add trailing /*
         glob = f"*.{raw}*" if '/' in raw else f"*.{raw}/*"
         url = f"https://index.commoncrawl.org/{self.index_id}-index?url={quote(glob, safe='*')}&output=json"
 
-        retries=(1,3,7)
-        self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}", 
+        retries = (1, 3, 7)
+        self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}",
                   params={"domain": domain, "url": url}, tag="URL_SEED")
-        for i,d in enumerate(retries+(-1,)):  # last -1 means don't retry
+        for i, d in enumerate(retries+(-1,)):  # last -1 means don't retry
             try:
                 async with self.client.stream("GET", url) as r:
                     r.raise_for_status()
-                    async with aiofiles.open(path,"w") as fp:
+                    async with aiofiles.open(path, "w") as fp:
                         async for line in r.aiter_lines():
                             rec = json.loads(line)
                             u = rec["url"]
                             await fp.write(u+"\n")
-                            if _match(u,pattern): yield u
+                            if _match(u, pattern):
+                                yield u
                 return
             except httpx.HTTPStatusError as e:
-                if e.response.status_code==503 and i<len(retries):
-                    self._log("warning", "Common Crawl API returned 503 for {domain}. Retrying in {delay}s.", 
+                if e.response.status_code == 503 and i < len(retries):
+                    self._log("warning", "Common Crawl API returned 503 for {domain}. Retrying in {delay}s.",
                               params={"domain": domain, "delay": retries[i]}, tag="URL_SEED")
                     await asyncio.sleep(retries[i])
                     continue
-                self._log("error", "HTTP error fetching CC index for {domain}: {error}", 
+                self._log("error", "HTTP error fetching CC index for {domain}: {error}",
                           params={"domain": domain, "error": str(e)}, tag="URL_SEED")
                 raise
             except Exception as e:
-                self._log("error", "Error fetching CC index for {domain}: {error}", 
+                self._log("error", "Error fetching CC index for {domain}: {error}",
                           params={"domain": domain, "error": str(e)}, tag="URL_SEED")
                 raise
 
-
     # ─────────────────────────────── Sitemaps
-    async def _from_sitemaps(self, domain:str, pattern:str, force:bool=False):
+    async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False):
         """
         1. Probe default sitemap locations.
         2. If none exist, parse robots.txt for alternative sitemap URLs.
@@ -543,15 +575,16 @@ class AsyncUrlSeeder:
 
         # 1️⃣ direct sitemap probe
         # strip any scheme so we can handle https → http fallback
-        host=re.sub(r'^https?://','',domain).rstrip('/')
+        host = re.sub(r'^https?://', '', domain).rstrip('/')
 
-        schemes=('https','http')  # prefer TLS, downgrade if needed
+        schemes = ('https', 'http')  # prefer TLS, downgrade if needed
         for scheme in schemes:
-            for suffix in ("/sitemap.xml","/sitemap_index.xml"):
-                sm=f"{scheme}://{host}{suffix}"
+            for suffix in ("/sitemap.xml", "/sitemap_index.xml"):
+                sm = f"{scheme}://{host}{suffix}"
                 sm = await self._resolve_head(sm)
                 if sm:
-                    self._log("info","Found sitemap at {url}",params={"url":sm},tag="URL_SEED")
+                    self._log("info", "Found sitemap at {url}", params={
+                              "url": sm}, tag="URL_SEED")
                     async with aiofiles.open(path, "w") as fp:
                         async for u in self._iter_sitemap(sm):
                             await fp.write(u + "\n")
@@ -560,15 +593,18 @@ class AsyncUrlSeeder:
                     return
 
         # 2️⃣ robots.txt fallback
-        robots=f"https://{domain.rstrip('/')}/robots.txt"
+        robots = f"https://{domain.rstrip('/')}/robots.txt"
         try:
-            r=await self.client.get(robots,timeout=10,follow_redirects=True)
-            if not 200<=r.status_code<300:
-                self._log("warning","robots.txt unavailable for {d} HTTP{c}",params={"d":domain,"c":r.status_code},tag="URL_SEED")
+            r = await self.client.get(robots, timeout=10, follow_redirects=True)
+            if not 200 <= r.status_code < 300:
+                self._log("warning", "robots.txt unavailable for {d} HTTP{c}", params={
+                          "d": domain, "c": r.status_code}, tag="URL_SEED")
                 return
-            sitemap_lines=[l.split(":",1)[1].strip() for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
+            sitemap_lines = [l.split(":", 1)[1].strip(
+            ) for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
         except Exception as e:
-            self._log("warning","Failed to fetch robots.txt for {d}: {e}",params={"d":domain,"e":str(e)},tag="URL_SEED")
+            self._log("warning", "Failed to fetch robots.txt for {d}: {e}", params={
+                      "d": domain, "e": str(e)}, tag="URL_SEED")
             return
 
         if sitemap_lines:
@@ -579,53 +615,60 @@ class AsyncUrlSeeder:
                         if _match(u, pattern):
                             yield u
 
-    async def _iter_sitemap(self, url:str):
+    async def _iter_sitemap(self, url: str):
         try:
             r = await self.client.get(url, timeout=15)
             r.raise_for_status()
         except httpx.HTTPStatusError as e:
-            self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}", 
+            self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
                       params={"url": url, "status_code": e.response.status_code}, tag="URL_SEED")
             return
         except httpx.RequestError as e:
-            self._log("warning", "Network error fetching sitemap {url}: {error}", 
+            self._log("warning", "Network error fetching sitemap {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
             return
         except Exception as e:
-            self._log("error", "Unexpected error fetching sitemap {url}: {error}", 
+            self._log("error", "Unexpected error fetching sitemap {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
             return
 
         data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
-        
+
+        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
+        is_sitemap_index = False
+        sub_sitemaps = []
+        regular_urls = []
+
         # Use lxml for XML parsing if available, as it's generally more robust
         if LXML:
             try:
                 # Use XML parser for sitemaps, not HTML parser
                 parser = etree.XMLParser(recover=True)
                 root = etree.fromstring(data, parser=parser)
-                
+
                 # Define namespace for sitemap
                 ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
-                
-                # First check if this is a sitemap index
-                for sitemap_elem in root.xpath('//s:sitemap/s:loc', namespaces=ns):
-                    loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
-                    if loc:
-                        self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
-                        async for u in self._iter_sitemap(loc):
-                            yield u
-                
-                # Then check for regular URLs
-                for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
-                    loc = loc_elem.text.strip() if loc_elem.text else ""
-                    if loc:
-                        yield loc
+
+                # Check for sitemap index entries
+                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
+                if sitemap_locs:
+                    is_sitemap_index = True
+                    for sitemap_elem in sitemap_locs:
+                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
+                        if loc:
+                            sub_sitemaps.append(loc)
+
+                # If not a sitemap index, get regular URLs
+                if not is_sitemap_index:
+                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
+                        loc = loc_elem.text.strip() if loc_elem.text else ""
+                        if loc:
+                            regular_urls.append(loc)
             except Exception as e:
-                self._log("error", "LXML parsing error for sitemap {url}: {error}", 
+                self._log("error", "LXML parsing error for sitemap {url}: {error}",
                           params={"url": url, "error": str(e)}, tag="URL_SEED")
                 return
-        else: # Fallback to xml.etree.ElementTree
+        else:  # Fallback to xml.etree.ElementTree
             import xml.etree.ElementTree as ET
             try:
                 # Parse the XML
@@ -634,89 +677,180 @@ class AsyncUrlSeeder:
                 for elem in root.iter():
                     if '}' in elem.tag:
                         elem.tag = elem.tag.split('}')[1]
-                
+
                 # Check for sitemap index entries
-                for sitemap in root.findall('.//sitemap'):
-                    loc_elem = sitemap.find('loc')
-                    if loc_elem is not None and loc_elem.text:
-                        loc = loc_elem.text.strip()
-                        self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
-                        async for u in self._iter_sitemap(loc):
-                            yield u
-                
-                # Check for regular URL entries
-                for url in root.findall('.//url'):
-                    loc_elem = url.find('loc')
-                    if loc_elem is not None and loc_elem.text:
-                        yield loc_elem.text.strip()
+                sitemaps = root.findall('.//sitemap')
+                if sitemaps:
+                    is_sitemap_index = True
+                    for sitemap in sitemaps:
+                        loc_elem = sitemap.find('loc')
+                        if loc_elem is not None and loc_elem.text:
+                            sub_sitemaps.append(loc_elem.text.strip())
+
+                # If not a sitemap index, get regular URLs
+                if not is_sitemap_index:
+                    for url_elem in root.findall('.//url'):
+                        loc_elem = url_elem.find('loc')
+                        if loc_elem is not None and loc_elem.text:
+                            regular_urls.append(loc_elem.text.strip())
             except Exception as e:
-                self._log("error", "ElementTree parsing error for sitemap {url}: {error}", 
+                self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                           params={"url": url, "error": str(e)}, tag="URL_SEED")
                 return
 
+        # Process based on type
+        if is_sitemap_index and sub_sitemaps:
+            self._log("info", "Processing sitemap index with {count} sub-sitemaps in parallel",
+                      params={"count": len(sub_sitemaps)}, tag="URL_SEED")
+
+            # Create a bounded queue for results to prevent RAM issues
+            # For sitemap indexes, use a larger queue as we expect many URLs
+            queue_size = min(50000, len(sub_sitemaps) * 1000)  # Estimate 1000 URLs per sitemap
+            result_queue = asyncio.Queue(maxsize=queue_size)
+            completed_count = 0
+            total_sitemaps = len(sub_sitemaps)
+
+            async def process_subsitemap(sitemap_url: str):
+                try:
+                    self._log(
+                        "debug", "Processing sub-sitemap: {url}", params={"url": sitemap_url}, tag="URL_SEED")
+                    # Recursively process sub-sitemap
+                    async for u in self._iter_sitemap(sitemap_url):
+                        await result_queue.put(u)  # Will block if queue is full
+                except Exception as e:
+                    self._log("error", "Error processing sub-sitemap {url}: {error}",
+                              params={"url": sitemap_url, "error": str(e)}, tag="URL_SEED")
+                finally:
+                    # Put sentinel to signal completion
+                    await result_queue.put(None)
+
+            # Start all tasks
+            tasks = [asyncio.create_task(process_subsitemap(sm))
+                     for sm in sub_sitemaps]
+
+            # Yield results as they come in
+            while completed_count < total_sitemaps:
+                item = await result_queue.get()
+                if item is None:
+                    completed_count += 1
+                else:
+                    yield item
+
+            # Ensure all tasks are done
+            await asyncio.gather(*tasks, return_exceptions=True)
+        else:
+            # Regular sitemap - yield URLs directly
+            for u in regular_urls:
+                yield u
 
     # ─────────────────────────────── validate helpers
-    async def _validate(self, url:str, res_list:List[Dict[str,Any]], live:bool,
-                        extract:bool, timeout:int, verbose:bool):
+    async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
+                        extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
+                        score_threshold: Optional[float] = None, scoring_method: str = "bm25"):
         # Local verbose parameter for this function is used to decide if intermediate logs should be printed
         # The main logger's verbose status should be controlled by the caller.
-        
+
         cache_kind = "head" if extract else "live"
 
         # ---------- try cache ----------
-        if (live or extract) and not (hasattr(self, 'force') and self.force):
-            cached = self._cache_get(cache_kind, url)
+        if not (hasattr(self, 'force') and self.force):
+            cached = await self._cache_get(cache_kind, url)
             if cached:
                 res_list.append(cached)
                 return
 
         if extract:
-            self._log("debug", "Fetching head for {url}", params={"url": url}, tag="URL_SEED")
-            ok,html,final = await self._fetch_head(url,timeout)
-            status="valid" if ok else "not_valid"
-            self._log("info" if ok else "warning", "HEAD {status} for {final_url}", 
+            self._log("debug", "Fetching head for {url}", params={
+                      "url": url}, tag="URL_SEED")
+            ok, html, final = await self._fetch_head(url, timeout)
+            status = "valid" if ok else "not_valid"
+            self._log("info" if ok else "warning", "HEAD {status} for {final_url}",
                       params={"status": status.upper(), "final_url": final or url}, tag="URL_SEED")
+            # head_data = _parse_head(html) if ok else {}
+            head_data = await asyncio.to_thread(_parse_head, html) if ok else {}
             entry = {
                 "url": final or url,
                 "status": status,
-                "head_data": _parse_head(html) if ok else {},
+                "head_data": head_data,
             }
-            if live or extract:
-                self._cache_set(cache_kind, url, entry)
-            res_list.append(entry)
+
+            # Apply BM25 scoring if query is provided and head data exists
+            if query and ok and scoring_method == "bm25" and head_data:
+                text_context = self._extract_text_context(head_data)
+                if text_context:
+                    # Calculate BM25 score for this single document
+                    # scores = self._calculate_bm25_score(query, [text_context])
+                    scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
+                    relevance_score = scores[0] if scores else 0.0
+                    entry["relevance_score"] = float(relevance_score)
+                else:
+                    # No text context, use URL-based scoring as fallback
+                    relevance_score = self._calculate_url_relevance_score(
+                        query, entry["url"])
+                    entry["relevance_score"] = float(relevance_score)
+            elif query:
+                # Query provided but no head data - we reject this entry
+                self._log("debug", "No head data for {url}, using URL-based scoring",
+                          params={"url": url}, tag="URL_SEED")
+                return
+                # relevance_score = self._calculate_url_relevance_score(query, entry["url"])
+                # entry["relevance_score"] = float(relevance_score)
+
         elif live:
-            self._log("debug", "Performing live check for {url}", params={"url": url}, tag="URL_SEED")
-            ok=await self._resolve_head(url)
-            status="valid" if ok else "not_valid"
-            self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}", 
+            self._log("debug", "Performing live check for {url}", params={
+                      "url": url}, tag="URL_SEED")
+            ok = await self._resolve_head(url)
+            status = "valid" if ok else "not_valid"
+            self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}",
                       params={"status": status.upper(), "url": url}, tag="URL_SEED")
             entry = {"url": url, "status": status, "head_data": {}}
-            if live or extract:
-                self._cache_set(cache_kind, url, entry)
-            res_list.append(entry)
+
+            # Apply URL-based scoring if query is provided
+            if query:
+                relevance_score = self._calculate_url_relevance_score(
+                    query, url)
+                entry["relevance_score"] = float(relevance_score)
+
         else:
             entry = {"url": url, "status": "unknown", "head_data": {}}
+
+            # Apply URL-based scoring if query is provided
+            if query:
+                relevance_score = self._calculate_url_relevance_score(
+                    query, url)
+                entry["relevance_score"] = float(relevance_score)
+
+        # Now decide whether to add the entry based on score threshold
+        if query and "relevance_score" in entry:
+            if score_threshold is None or entry["relevance_score"] >= score_threshold:
+                if live or extract:
+                    await self._cache_set(cache_kind, url, entry)
+                res_list.append(entry)
+            else:
+                self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
+                          params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
+        else:
+            # No query or no scoring - add as usual
             if live or extract:
-                self._cache_set(cache_kind, url, entry)
+                await self._cache_set(cache_kind, url, entry)
             res_list.append(entry)
 
-
-    async def _head_ok(self, url:str, timeout:int)->bool:
+    async def _head_ok(self, url: str, timeout: int) -> bool:
         try:
-            r=await self.client.head(url, timeout=timeout,
-                headers={"Range":"bytes=0-0","Accept-Encoding":"identity"})
-            r.raise_for_status() # Raise for bad status codes (4xx, 5xx)
+            r = await self.client.head(url, timeout=timeout,
+                                       headers={"Range": "bytes=0-0", "Accept-Encoding": "identity"})
+            r.raise_for_status()  # Raise for bad status codes (4xx, 5xx)
             return True
         except httpx.RequestError as e:
-            self._log("debug", "HEAD check network error for {url}: {error}", 
+            self._log("debug", "HEAD check network error for {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
             return False
         except httpx.HTTPStatusError as e:
-            self._log("debug", "HEAD check HTTP status error for {url}: {status_code}", 
+            self._log("debug", "HEAD check HTTP status error for {url}: {status_code}",
                       params={"url": url, "status_code": e.response.status_code}, tag="URL_SEED")
             return False
         except Exception as e:
-            self._log("error", "Unexpected error during HEAD check for {url}: {error}", 
+            self._log("error", "Unexpected error during HEAD check for {url}: {error}",
                       params={"url": url, "error": str(e)}, tag="URL_SEED")
             return False
 
@@ -726,7 +860,7 @@ class AsyncUrlSeeder:
         timeout: int,
         max_redirects: int = 5,
         max_bytes: int = 65_536,  # stop after 64 kB even if </head> never comes
-        chunk_size: int = 4096,       # how much we read per await        
+        chunk_size: int = 4096,       # how much we read per await
     ):
         for _ in range(max_redirects+1):
             try:
@@ -742,22 +876,24 @@ class AsyncUrlSeeder:
                     },
                     follow_redirects=False,
                 ) as r:
-                    
-                    if r.status_code in (301,302,303,307,308):
+
+                    if r.status_code in (301, 302, 303, 307, 308):
                         location = r.headers.get("Location")
                         if location:
                             url = urljoin(url, location)
-                            self._log("debug", "Redirecting from {original_url} to {new_url}", 
+                            self._log("debug", "Redirecting from {original_url} to {new_url}",
                                       params={"original_url": r.url, "new_url": url}, tag="URL_SEED")
                             continue
                         else:
-                            self._log("warning", "Redirect status {status_code} but no Location header for {url}", 
+                            self._log("warning", "Redirect status {status_code} but no Location header for {url}",
                                       params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
-                            return False, "", str(r.url) # Return original URL if no new location
+                            # Return original URL if no new location
+                            return False, "", str(r.url)
 
                     # For 2xx or other non-redirect codes, proceed to read content
-                    if not (200 <= r.status_code < 400): # Only allow successful codes, or continue
-                        self._log("warning", "Non-success status {status_code} when fetching head for {url}", 
+                    # Only allow successful codes, or continue
+                    if not (200 <= r.status_code < 400):
+                        self._log("warning", "Non-success status {status_code} when fetching head for {url}",
                                   params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
                         return False, "", str(r.url)
 
@@ -768,7 +904,7 @@ class AsyncUrlSeeder:
                         if b"</head>" in low or len(buf) >= max_bytes:
                             await r.aclose()
                             break
-                    
+
                     enc = r.headers.get("Content-Encoding", "").lower()
                     try:
                         if enc == "gzip" and buf[:2] == b"\x1f\x8b":
@@ -779,28 +915,30 @@ class AsyncUrlSeeder:
                             # Header says “gzip” or “br” but payload is plain – ignore
                             self._log(
                                 "debug",
-                               "Skipping bogus {encoding} for {url}",
+                                "Skipping bogus {encoding} for {url}",
                                 params={"encoding": enc, "url": r.url},
                                 tag="URL_SEED",
                             )
                     except Exception as e:
                         self._log(
-                           "warning",
+                            "warning",
                             "Decompression error for {url} ({encoding}): {error}",
-                            params={"url": r.url, "encoding": enc, "error": str(e)},
+                            params={"url": r.url,
+                                    "encoding": enc, "error": str(e)},
                             tag="URL_SEED",
-                       )
+                        )
                         # fall through with raw buf
-                    
+
                     # Find the </head> tag case-insensitively and decode
                     idx = buf.lower().find(b"</head>")
-                    if idx==-1: 
-                        self._log("debug", "No </head> tag found in initial bytes of {url}", 
+                    if idx == -1:
+                        self._log("debug", "No </head> tag found in initial bytes of {url}",
                                   params={"url": r.url}, tag="URL_SEED")
                         # If no </head> is found, take a reasonable chunk or all if small
-                        html_bytes = buf if len(buf) < 10240 else buf[:10240] # Take max 10KB if no head tag
+                        # Take max 10KB if no head tag
+                        html_bytes = buf if len(buf) < 10240 else buf[:10240]
                     else:
-                        html_bytes = buf[:idx+7] # Include </head> tag
+                        html_bytes = buf[:idx+7]  # Include </head> tag
 
                     try:
                         html = html_bytes.decode("utf-8", "replace")
@@ -813,49 +951,50 @@ class AsyncUrlSeeder:
                         )
                         html = html_bytes.decode("latin-1", "replace")
 
-                    return True,html,str(r.url) # Return the actual URL after redirects
-                    
+                    # Return the actual URL after redirects
+                    return True, html, str(r.url)
+
             except httpx.RequestError as e:
-                self._log("debug", "Fetch head network error for {url}: {error}", 
+                self._log("debug", "Fetch head network error for {url}: {error}",
                           params={"url": url, "error": str(e)}, tag="URL_SEED")
-                return False,"",url
-        
+                return False, "", url
+
         # If loop finishes without returning (e.g. too many redirects)
-        self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}", 
+        self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}",
                   params={"max_redirects": max_redirects, "url": url}, tag="URL_SEED")
-        return False,"",url
+        return False, "", url
 
     # ─────────────────────────────── BM25 scoring helpers
     def _extract_text_context(self, head_data: Dict[str, Any]) -> str:
         """Extract all relevant text from head metadata for scoring."""
         # Priority fields with their weights (for future enhancement)
         text_parts = []
-        
+
         # Title
         if head_data.get("title"):
             text_parts.append(head_data["title"])
-        
+
         # Standard meta tags
         meta = head_data.get("meta", {})
         for key in ["description", "keywords", "author", "subject", "summary", "abstract"]:
             if meta.get(key):
                 text_parts.append(meta[key])
-        
+
         # Open Graph tags
         for key in ["og:title", "og:description", "og:site_name", "article:tag"]:
             if meta.get(key):
                 text_parts.append(meta[key])
-        
+
         # Twitter Card tags
         for key in ["twitter:title", "twitter:description", "twitter:image:alt"]:
             if meta.get(key):
                 text_parts.append(meta[key])
-        
+
         # Dublin Core tags
         for key in ["dc.title", "dc.description", "dc.subject", "dc.creator"]:
             if meta.get(key):
                 text_parts.append(meta[key])
-        
+
         # JSON-LD structured data
         for jsonld in head_data.get("jsonld", []):
             if isinstance(jsonld, dict):
@@ -865,8 +1004,9 @@ class AsyncUrlSeeder:
                         if isinstance(jsonld[field], str):
                             text_parts.append(jsonld[field])
                         elif isinstance(jsonld[field], list):
-                            text_parts.extend(str(item) for item in jsonld[field] if item)
-                
+                            text_parts.extend(str(item)
+                                              for item in jsonld[field] if item)
+
                 # Handle @graph structures
                 if "@graph" in jsonld and isinstance(jsonld["@graph"], list):
                     for item in jsonld["@graph"]:
@@ -874,71 +1014,176 @@ class AsyncUrlSeeder:
                             for field in ["name", "headline", "description"]:
                                 if field in item and isinstance(item[field], str):
                                     text_parts.append(item[field])
-        
+
         # Combine all text parts
         return " ".join(filter(None, text_parts))
-    
+
+    def _calculate_url_relevance_score(self, query: str, url: str) -> float:
+        """Calculate relevance score between query and URL using string matching."""
+        # Normalize inputs
+        query_lower = query.lower()
+        url_lower = url.lower()
+
+        # Extract URL components
+        from urllib.parse import urlparse
+        parsed = urlparse(url)
+        domain = parsed.netloc.replace('www.', '')
+        path = parsed.path.strip('/')
+
+        # Create searchable text from URL
+        # Split domain by dots and path by slashes
+        domain_parts = domain.split('.')
+        path_parts = [p for p in path.split('/') if p]
+
+        # Include query parameters if any
+        query_params = parsed.query
+        param_parts = []
+        if query_params:
+            for param in query_params.split('&'):
+                if '=' in param:
+                    key, value = param.split('=', 1)
+                    param_parts.extend([key, value])
+
+        # Combine all parts
+        all_parts = domain_parts + path_parts + param_parts
+
+        # Calculate scores
+        scores = []
+        query_tokens = query_lower.split()
+
+        # 1. Exact match in any part (highest score)
+        for part in all_parts:
+            part_lower = part.lower()
+            if query_lower in part_lower:
+                scores.append(1.0)
+            elif part_lower in query_lower:
+                scores.append(0.9)
+
+        # 2. Token matching
+        for token in query_tokens:
+            token_scores = []
+            for part in all_parts:
+                part_lower = part.lower()
+                if token in part_lower:
+                    # Score based on how much of the part the token covers
+                    coverage = len(token) / len(part_lower)
+                    token_scores.append(0.7 * coverage)
+                elif part_lower in token:
+                    coverage = len(part_lower) / len(token)
+                    token_scores.append(0.6 * coverage)
+
+            if token_scores:
+                scores.append(max(token_scores))
+
+        # 3. Character n-gram similarity (for fuzzy matching)
+        def get_ngrams(text, n=3):
+            return set(text[i:i+n] for i in range(len(text)-n+1))
+
+        # Combine all URL parts into one string for n-gram comparison
+        url_text = ' '.join(all_parts).lower()
+        if len(query_lower) >= 3 and len(url_text) >= 3:
+            query_ngrams = get_ngrams(query_lower)
+            url_ngrams = get_ngrams(url_text)
+            if query_ngrams and url_ngrams:
+                intersection = len(query_ngrams & url_ngrams)
+                union = len(query_ngrams | url_ngrams)
+                jaccard = intersection / union if union > 0 else 0
+                scores.append(0.5 * jaccard)
+
+        # Calculate final score
+        if not scores:
+            return 0.0
+
+        # Weighted average with bias towards higher scores
+        scores.sort(reverse=True)
+        weighted_score = 0
+        total_weight = 0
+        for i, score in enumerate(scores):
+            weight = 1 / (i + 1)  # Higher weight for better matches
+            weighted_score += score * weight
+            total_weight += weight
+
+        final_score = weighted_score / total_weight if total_weight > 0 else 0
+        return min(final_score, 1.0)  # Cap at 1.0
+
     def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
         """Calculate BM25 scores for documents against a query."""
         if not HAS_BM25:
-            self._log("warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
+            self._log(
+                "warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
             return [0.0] * len(documents)
-        
+
         if not query or not documents:
             return [0.0] * len(documents)
-        
+
         # Tokenize query and documents (simple whitespace tokenization)
         # For production, consider using a proper tokenizer
         query_tokens = query.lower().split()
         tokenized_docs = [doc.lower().split() for doc in documents]
-        
+
         # Handle edge case where all documents are empty
         if all(len(doc) == 0 for doc in tokenized_docs):
             return [0.0] * len(documents)
-        
+
         # Create BM25 instance and calculate scores
         try:
             from rank_bm25 import BM25Okapi
             bm25 = BM25Okapi(tokenized_docs)
             scores = bm25.get_scores(query_tokens)
-            
+
             # Normalize scores to 0-1 range
             max_score = max(scores) if max(scores) > 0 else 1.0
             normalized_scores = [score / max_score for score in scores]
-            
+
             return normalized_scores
         except Exception as e:
-            self._log("error", "Error calculating BM25 scores: {error}", 
+            self._log("error", "Error calculating BM25 scores: {error}",
                       params={"error": str(e)}, tag="URL_SEED")
             return [0.0] * len(documents)
 
+    # ─────────────────────────────── cleanup methods
+    async def close(self):
+        """Close the HTTP client if we own it."""
+        if self._owns_client and self.client:
+            await self.client.aclose()
+            self._log("debug", "Closed HTTP client", tag="URL_SEED")
+    
+    async def __aenter__(self):
+        """Async context manager entry."""
+        return self
+    
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        """Async context manager exit."""
+        await self.close()
+        return False
+
     # ─────────────────────────────── index helper
-    async def _latest_index(self)->str:
-        if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime)<self.ttl.total_seconds():
-            self._log("info", "Loading latest CC index from cache: {path}", 
+    async def _latest_index(self) -> str:
+        if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime) < self.ttl.total_seconds():
+            self._log("info", "Loading latest CC index from cache: {path}",
                       params={"path": self.index_cache_path}, tag="URL_SEED")
             return self.index_cache_path.read_text().strip()
-        
-        self._log("info", "Fetching latest Common Crawl index from {url}", 
+
+        self._log("info", "Fetching latest Common Crawl index from {url}",
                   params={"url": COLLINFO_URL}, tag="URL_SEED")
         try:
             async with httpx.AsyncClient() as c:
-                j=await c.get(COLLINFO_URL,timeout=10)
-                j.raise_for_status() # Raise an exception for bad status codes
-                idx=j.json()[0]["id"]
+                j = await c.get(COLLINFO_URL, timeout=10)
+                j.raise_for_status()  # Raise an exception for bad status codes
+                idx = j.json()[0]["id"]
                 self.index_cache_path.write_text(idx)
-                self._log("success", "Successfully fetched and cached CC index: {index_id}", 
+                self._log("success", "Successfully fetched and cached CC index: {index_id}",
                           params={"index_id": idx}, tag="URL_SEED")
                 return idx
         except httpx.RequestError as e:
-            self._log("error", "Network error fetching CC index info: {error}", 
+            self._log("error", "Network error fetching CC index info: {error}",
                       params={"error": str(e)}, tag="URL_SEED")
             raise
         except httpx.HTTPStatusError as e:
-            self._log("error", "HTTP error fetching CC index info: {status_code}", 
+            self._log("error", "HTTP error fetching CC index info: {status_code}",
                       params={"status_code": e.response.status_code}, tag="URL_SEED")
             raise
         except Exception as e:
-            self._log("error", "Unexpected error fetching CC index info: {error}", 
+            self._log("error", "Unexpected error fetching CC index info: {error}",
                       params={"error": str(e)}, tag="URL_SEED")
-            raise
\ No newline at end of file
+            raise
diff --git a/docs/examples/url_seeder/bbc_sport_research_assistant.py b/docs/examples/url_seeder/bbc_sport_research_assistant.py
new file mode 100644
index 00000000..c9104f2f
--- /dev/null
+++ b/docs/examples/url_seeder/bbc_sport_research_assistant.py
@@ -0,0 +1,806 @@
+"""
+BBC Sport Research Assistant Pipeline
+=====================================
+
+This example demonstrates how URLSeeder helps create an efficient research pipeline:
+1. Discover all available URLs without crawling
+2. Filter and rank them based on relevance
+3. Crawl only the most relevant content
+4. Generate comprehensive research insights
+
+Pipeline Steps:
+1. Get user query
+2. Optionally enhance query using LLM
+3. Use URLSeeder to discover and rank URLs
+4. Crawl top K URLs with BM25 filtering
+5. Generate detailed response with citations
+
+Requirements:
+- pip install crawl4ai
+- pip install litellm
+- export GEMINI_API_KEY="your-api-key"
+
+Usage:
+- Run normally: python bbc_sport_research_assistant.py
+- Run test mode: python bbc_sport_research_assistant.py test
+"""
+
+import asyncio
+import json
+import os
+import hashlib
+import pickle
+from typing import List, Dict, Optional, Tuple
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from pathlib import Path
+
+# Rich for colored output
+from rich.console import Console
+from rich.text import Text
+from rich.panel import Panel
+from rich.table import Table
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+# Crawl4AI imports
+from crawl4ai import (
+    AsyncWebCrawler, 
+    BrowserConfig, 
+    CrawlerRunConfig,
+    AsyncUrlSeeder, 
+    SeedingConfig,
+    AsyncLogger
+)
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+# LiteLLM for AI communication
+import litellm
+
+# Initialize Rich console
+console = Console()
+
+# Get the current directory where this script is located
+SCRIPT_DIR = Path(__file__).parent.resolve()
+
+# Cache configuration - relative to script directory
+CACHE_DIR = SCRIPT_DIR / "temp_cache"
+CACHE_DIR.mkdir(parents=True, exist_ok=True)
+
+# Testing limits
+TESTING_MODE = True
+MAX_URLS_DISCOVERY = 100 if TESTING_MODE else 1000
+MAX_URLS_TO_CRAWL = 5 if TESTING_MODE else 10
+
+
+def get_cache_key(prefix: str, *args) -> str:
+    """Generate cache key from prefix and arguments"""
+    content = f"{prefix}:{'|'.join(str(arg) for arg in args)}"
+    return hashlib.md5(content.encode()).hexdigest()
+
+
+def load_from_cache(cache_key: str) -> Optional[any]:
+    """Load data from cache if exists"""
+    cache_path = CACHE_DIR / f"{cache_key}.pkl"
+    if cache_path.exists():
+        with open(cache_path, 'rb') as f:
+            return pickle.load(f)
+    return None
+
+
+def save_to_cache(cache_key: str, data: any) -> None:
+    """Save data to cache"""
+    cache_path = CACHE_DIR / f"{cache_key}.pkl"
+    with open(cache_path, 'wb') as f:
+        pickle.dump(data, f)
+
+
+@dataclass
+class ResearchConfig:
+    """Configuration for research pipeline"""
+    # Core settings
+    domain: str = "www.bbc.com/sport"
+    max_urls_discovery: int = 100
+    max_urls_to_crawl: int = 10
+    top_k_urls: int = 10
+    
+    # Scoring and filtering
+    score_threshold: float = 0.1
+    scoring_method: str = "bm25"
+    
+    # Processing options
+    use_llm_enhancement: bool = True
+    extract_head_metadata: bool = True
+    live_check: bool = True
+    force_refresh: bool = False
+    
+    # Crawler settings
+    max_concurrent_crawls: int = 5
+    timeout: int = 30000
+    headless: bool = True
+    
+    # Output settings
+    save_json: bool = True
+    save_markdown: bool = True
+    output_dir: str = None  # Will be set in __post_init__
+    
+    # Development settings
+    test_mode: bool = False
+    interactive_mode: bool = False
+    verbose: bool = True
+    
+    def __post_init__(self):
+        """Adjust settings based on test mode"""
+        if self.test_mode:
+            self.max_urls_discovery = 50
+            self.max_urls_to_crawl = 3
+            self.top_k_urls = 5
+        
+        # Set default output directory relative to script location
+        if self.output_dir is None:
+            self.output_dir = str(SCRIPT_DIR / "research_results")
+
+
+@dataclass
+class ResearchQuery:
+    """Container for research query and metadata"""
+    original_query: str
+    enhanced_query: Optional[str] = None
+    search_patterns: List[str] = None
+    timestamp: str = None
+
+
+@dataclass
+class ResearchResult:
+    """Container for research results"""
+    query: ResearchQuery
+    discovered_urls: List[Dict]
+    crawled_content: List[Dict]
+    synthesis: str
+    citations: List[Dict]
+    metadata: Dict
+
+
+async def get_user_query() -> str:
+    """
+    Get research query from user input
+    """
+    query = input("\n🔍 Enter your research query: ")
+    return query.strip()
+
+
+async def enhance_query_with_llm(query: str) -> ResearchQuery:
+    """
+    Use LLM to enhance the research query:
+    - Extract key terms
+    - Generate search patterns
+    - Identify related topics
+    """
+    # Check cache
+    cache_key = get_cache_key("enhanced_query", query)
+    cached_result = load_from_cache(cache_key)
+    if cached_result:
+        console.print("[dim cyan]📦 Using cached enhanced query[/dim cyan]")
+        return cached_result
+    
+    try:
+        response = await litellm.acompletion(
+            model="gemini/gemini-2.5-flash-preview-04-17",
+            messages=[{
+                "role": "user", 
+                "content": f"""Given this research query: "{query}"
+                
+                Extract:
+                1. Key terms and concepts (as a list)
+                2. Related search terms
+                3. A more specific/enhanced version of the query
+                
+                Return as JSON:
+                {{
+                    "key_terms": ["term1", "term2"],
+                    "related_terms": ["related1", "related2"],
+                    "enhanced_query": "enhanced version of query"
+                }}"""
+            }],
+            # reasoning_effort="low",
+            temperature=0.3,
+            response_format={"type": "json_object"}
+        )
+        
+        data = json.loads(response.choices[0].message.content)
+        
+        # Create search patterns
+        all_terms = data["key_terms"] + data["related_terms"]
+        patterns = [f"*{term.lower()}*" for term in all_terms]
+        
+        result = ResearchQuery(
+            original_query=query,
+            enhanced_query=data["enhanced_query"],
+            search_patterns=patterns[:10],  # Limit patterns
+            timestamp=datetime.now().isoformat()
+        )
+        
+        # Cache the result
+        save_to_cache(cache_key, result)
+        return result
+        
+    except Exception as e:
+        console.print(f"[yellow]⚠️ LLM enhancement failed: {e}[/yellow]")
+        # Fallback to simple tokenization
+        return ResearchQuery(
+            original_query=query,
+            enhanced_query=query,
+            search_patterns=tokenize_query_to_patterns(query),
+            timestamp=datetime.now().isoformat()
+        )
+
+
+def tokenize_query_to_patterns(query: str) -> List[str]:
+    """
+    Convert query into URL patterns for URLSeeder
+    Example: "AI startups funding" -> ["*ai*", "*startup*", "*funding*"]
+    """
+    # Simple tokenization - split and create patterns
+    words = query.lower().split()
+    # Filter out common words
+    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'that'}
+    keywords = [w for w in words if w not in stop_words and len(w) > 2]
+    
+    # Create patterns
+    patterns = [f"*{keyword}*" for keyword in keywords]
+    return patterns[:8]  # Limit to 8 patterns
+
+
+async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List[Dict]:
+    """
+    Use URLSeeder to discover and rank URLs:
+    1. Fetch all URLs from domain
+    2. Filter by patterns
+    3. Extract metadata (titles, descriptions)
+    4. Rank by BM25 relevance score
+    5. Return top K URLs
+    """
+    # Check cache
+    cache_key = get_cache_key("discovered_urls", domain, query, config.top_k_urls)
+    cached_result = load_from_cache(cache_key)
+    if cached_result and not config.force_refresh:
+        console.print("[dim cyan]📦 Using cached URL discovery[/dim cyan]")
+        return cached_result
+    
+    console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
+    
+    # Initialize URL seeder
+    seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
+    
+    # Configure seeding
+    seeding_config = SeedingConfig(
+        source="sitemap+cc",  # Use both sitemap and Common Crawl
+        extract_head=config.extract_head_metadata,
+        query=query,
+        scoring_method=config.scoring_method,
+        score_threshold=config.score_threshold,
+        max_urls=config.max_urls_discovery,
+        live_check=config.live_check,
+        force=config.force_refresh
+    )
+    
+    try:
+        # Discover URLs
+        urls = await seeder.urls(domain, seeding_config)
+        
+        # Sort by relevance score (descending)
+        sorted_urls = sorted(
+            urls, 
+            key=lambda x: x.get('relevance_score', 0), 
+            reverse=True
+        )
+        
+        # Take top K
+        top_urls = sorted_urls[:config.top_k_urls]
+        
+        console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
+        
+        # Cache the result
+        save_to_cache(cache_key, top_urls)
+        return top_urls
+        
+    except Exception as e:
+        console.print(f"[red]❌ URL discovery failed: {e}[/red]")
+        return []
+
+
+async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]:
+    """
+    Crawl selected URLs with content filtering:
+    - Use AsyncWebCrawler.arun_many()
+    - Apply content filter
+    - Generate clean markdown
+    """
+    # Extract just URLs from the discovery results
+    url_list = [u['url'] for u in urls if 'url' in u][:config.max_urls_to_crawl]
+    
+    if not url_list:
+        console.print("[red]❌ No URLs to crawl[/red]")
+        return []
+    
+    console.print(f"\n[cyan]🕷️ Crawling {len(url_list)} URLs...[/cyan]")
+    
+    # Check cache for each URL
+    crawled_results = []
+    urls_to_crawl = []
+    
+    for url in url_list:
+        cache_key = get_cache_key("crawled_content", url, query)
+        cached_content = load_from_cache(cache_key)
+        if cached_content and not config.force_refresh:
+            crawled_results.append(cached_content)
+        else:
+            urls_to_crawl.append(url)
+    
+    if urls_to_crawl:
+        console.print(f"[cyan]📥 Crawling {len(urls_to_crawl)} new URLs (cached: {len(crawled_results)})[/cyan]")
+                
+        # Configure markdown generator with content filter
+        md_generator = DefaultMarkdownGenerator(
+            content_filter=PruningContentFilter(
+                threshold=0.48,
+                threshold_type="dynamic",
+                min_word_threshold=10
+            ),
+        )
+        
+        # Configure crawler
+        crawler_config = CrawlerRunConfig(
+            markdown_generator=md_generator,
+            exclude_external_links=True,
+            excluded_tags=['nav', 'header', 'footer', 'aside'],
+        )
+        
+        # Create crawler with browser config
+        async with AsyncWebCrawler(
+            config=BrowserConfig(
+                headless=config.headless,
+                verbose=config.verbose
+            )
+        ) as crawler:
+            # Crawl URLs
+            results = await crawler.arun_many(
+                urls_to_crawl,
+                config=crawler_config,
+                max_concurrent=config.max_concurrent_crawls
+            )
+            
+            # Process results
+            for url, result in zip(urls_to_crawl, results):
+                if result.success:
+                    content_data = {
+                        'url': url,
+                        'title': result.metadata.get('title', ''),
+                        'markdown': result.markdown.fit_markdown or result.markdown.raw_markdown,
+                        'raw_length': len(result.markdown.raw_markdown),
+                        'fit_length': len(result.markdown.fit_markdown) if result.markdown.fit_markdown else len(result.markdown.raw_markdown),
+                        'metadata': result.metadata
+                    }
+                    crawled_results.append(content_data)
+                    
+                    # Cache the result
+                    cache_key = get_cache_key("crawled_content", url, query)
+                    save_to_cache(cache_key, content_data)
+                else:
+                    console.print(f"  [red]❌ Failed: {url[:50]}... - {result.error}[/red]")
+    
+    console.print(f"[green]✅ Successfully crawled {len(crawled_results)} URLs[/green]")
+    return crawled_results
+
+
+async def generate_research_synthesis(
+    query: str, 
+    crawled_content: List[Dict]
+) -> Tuple[str, List[Dict]]:
+    """
+    Use LLM to synthesize research findings:
+    - Analyze all crawled content
+    - Generate comprehensive answer
+    - Extract citations and references
+    """
+    if not crawled_content:
+        return "No content available for synthesis.", []
+    
+    console.print("\n[cyan]🤖 Generating research synthesis...[/cyan]")
+    
+    # Prepare content for LLM
+    content_sections = []
+    for i, content in enumerate(crawled_content, 1):
+        section = f"""
+SOURCE {i}:
+Title: {content['title']}
+URL: {content['url']}
+Content Preview:
+{content['markdown'][:1500]}...
+"""
+        content_sections.append(section)
+    
+    combined_content = "\n---\n".join(content_sections)
+    
+    try:
+        response = await litellm.acompletion(
+            model="gemini/gemini-2.5-flash-preview-04-17",
+            messages=[{
+                "role": "user",
+                "content": f"""Research Query: "{query}"
+
+Based on the following sources, provide a comprehensive research synthesis.
+
+{combined_content}
+
+Please provide:
+1. An executive summary (2-3 sentences)
+2. Key findings (3-5 bullet points)
+3. Detailed analysis (2-3 paragraphs)
+4. Future implications or trends
+
+Format your response with clear sections and cite sources using [Source N] notation.
+Keep the total response under 800 words."""
+            }],
+            # reasoning_effort="medium",
+            temperature=0.7
+        )
+        
+        synthesis = response.choices[0].message.content
+        
+        # Extract citations from the synthesis
+        citations = []
+        for i, content in enumerate(crawled_content, 1):
+            if f"[Source {i}]" in synthesis or f"Source {i}" in synthesis:
+                citations.append({
+                    'source_id': i,
+                    'title': content['title'],
+                    'url': content['url']
+                })
+        
+        return synthesis, citations
+        
+    except Exception as e:
+        console.print(f"[red]❌ Synthesis generation failed: {e}[/red]")
+        # Fallback to simple summary
+        summary = f"Research on '{query}' found {len(crawled_content)} relevant articles:\n\n"
+        for content in crawled_content[:3]:
+            summary += f"- {content['title']}\n  {content['url']}\n\n"
+        return summary, []
+
+
+def format_research_output(result: ResearchResult) -> str:
+    """
+    Format the final research output with:
+    - Executive summary
+    - Key findings
+    - Detailed analysis
+    - Citations and sources
+    """
+    output = []
+    output.append("\n" + "=" * 60)
+    output.append("🔬 RESEARCH RESULTS")
+    output.append("=" * 60)
+    
+    # Query info
+    output.append(f"\n📋 Query: {result.query.original_query}")
+    if result.query.enhanced_query != result.query.original_query:
+        output.append(f"   Enhanced: {result.query.enhanced_query}")
+    
+    # Discovery stats
+    output.append(f"\n📊 Statistics:")
+    output.append(f"   - URLs discovered: {len(result.discovered_urls)}")
+    output.append(f"   - URLs crawled: {len(result.crawled_content)}")
+    output.append(f"   - Processing time: {result.metadata.get('duration', 'N/A')}")
+    
+    # Synthesis
+    output.append(f"\n📝 SYNTHESIS")
+    output.append("-" * 60)
+    output.append(result.synthesis)
+    
+    # Citations
+    if result.citations:
+        output.append(f"\n📚 SOURCES")
+        output.append("-" * 60)
+        for citation in result.citations:
+            output.append(f"[{citation['source_id']}] {citation['title']}")
+            output.append(f"    {citation['url']}")
+    
+    return "\n".join(output)
+
+
+async def save_research_results(result: ResearchResult, config: ResearchConfig) -> Tuple[str, str]:
+    """
+    Save research results in JSON and Markdown formats
+    
+    Returns:
+        Tuple of (json_path, markdown_path)
+    """
+    # Create output directory
+    output_dir = Path(config.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    # Generate filename based on query and timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    query_slug = result.query.original_query[:50].replace(" ", "_").replace("/", "_")
+    base_filename = f"{timestamp}_{query_slug}"
+    
+    json_path = None
+    md_path = None
+    
+    # Save JSON
+    if config.save_json:
+        json_path = output_dir / f"{base_filename}.json"
+        with open(json_path, 'w') as f:
+            json.dump(asdict(result), f, indent=2, default=str)
+        console.print(f"\n[green]💾 JSON saved: {json_path}[/green]")
+    
+    # Save Markdown
+    if config.save_markdown:
+        md_path = output_dir / f"{base_filename}.md"
+        
+        # Create formatted markdown
+        md_content = [
+            f"# Research Report: {result.query.original_query}",
+            f"\n**Generated on:** {result.metadata.get('timestamp', 'N/A')}",
+            f"\n**Domain:** {result.metadata.get('domain', 'N/A')}",
+            f"\n**Processing time:** {result.metadata.get('duration', 'N/A')}",
+            "\n---\n",
+            "## Query Information",
+            f"- **Original Query:** {result.query.original_query}",
+            f"- **Enhanced Query:** {result.query.enhanced_query or 'N/A'}",
+            f"- **Search Patterns:** {', '.join(result.query.search_patterns or [])}",
+            "\n## Statistics",
+            f"- **URLs Discovered:** {len(result.discovered_urls)}",
+            f"- **URLs Crawled:** {len(result.crawled_content)}",
+            f"- **Sources Cited:** {len(result.citations)}",
+            "\n## Research Synthesis\n",
+            result.synthesis,
+            "\n## Sources\n"
+        ]
+        
+        # Add citations
+        for citation in result.citations:
+            md_content.append(f"### [{citation['source_id']}] {citation['title']}")
+            md_content.append(f"- **URL:** [{citation['url']}]({citation['url']})")
+            md_content.append("")
+        
+        # Add discovered URLs summary
+        md_content.extend([
+            "\n## Discovered URLs (Top 10)\n",
+            "| Score | URL | Title |",
+            "|-------|-----|-------|"
+        ])
+        
+        for url_data in result.discovered_urls[:10]:
+            score = url_data.get('relevance_score', 0)
+            url = url_data.get('url', '')
+            title = 'N/A'
+            if 'head_data' in url_data and url_data['head_data']:
+                title = url_data['head_data'].get('title', 'N/A')[:60] + '...'
+            md_content.append(f"| {score:.3f} | {url[:50]}... | {title} |")
+        
+        # Write markdown
+        with open(md_path, 'w') as f:
+            f.write('\n'.join(md_content))
+        
+        console.print(f"[green]📄 Markdown saved: {md_path}[/green]")
+    
+    return str(json_path) if json_path else None, str(md_path) if md_path else None
+
+
+async def wait_for_user(message: str = "\nPress Enter to continue..."):
+    """Wait for user input in interactive mode"""
+    input(message)
+
+
+async def research_pipeline(
+    query: str,
+    config: ResearchConfig
+) -> ResearchResult:
+    """
+    Main research pipeline orchestrator with configurable settings
+    """
+    start_time = datetime.now()
+    
+    # Display pipeline header
+    header = Panel(
+        f"[bold cyan]Research Pipeline[/bold cyan]\n\n"
+        f"[dim]Domain:[/dim] {config.domain}\n"
+        f"[dim]Mode:[/dim] {'Test' if config.test_mode else 'Production'}\n"
+        f"[dim]Interactive:[/dim] {'Yes' if config.interactive_mode else 'No'}",
+        title="🚀 Starting",
+        border_style="cyan"
+    )
+    console.print(header)
+    
+    # Step 1: Enhance query (optional)
+    console.print(f"\n[bold cyan]📝 Step 1: Query Processing[/bold cyan]")
+    if config.interactive_mode:
+        await wait_for_user()
+        
+    if config.use_llm_enhancement:
+        research_query = await enhance_query_with_llm(query)
+    else:
+        research_query = ResearchQuery(
+            original_query=query,
+            enhanced_query=query,
+            search_patterns=tokenize_query_to_patterns(query),
+            timestamp=datetime.now().isoformat()
+        )
+    
+    console.print(f"   [green]✅ Query ready:[/green] {research_query.enhanced_query or query}")
+    
+    # Step 2: Discover URLs
+    console.print(f"\n[bold cyan]🔍 Step 2: URL Discovery[/bold cyan]")
+    if config.interactive_mode:
+        await wait_for_user()
+        
+    discovered_urls = await discover_urls(
+        domain=config.domain,
+        query=research_query.enhanced_query or query,
+        config=config
+    )
+    
+    if not discovered_urls:
+        return ResearchResult(
+            query=research_query,
+            discovered_urls=[],
+            crawled_content=[],
+            synthesis="No relevant URLs found for the given query.",
+            citations=[],
+            metadata={'duration': str(datetime.now() - start_time)}
+        )
+    
+    console.print(f"   [green]✅ Found {len(discovered_urls)} relevant URLs[/green]")
+    
+    # Step 3: Crawl selected URLs
+    console.print(f"\n[bold cyan]🕷️ Step 3: Content Crawling[/bold cyan]")
+    if config.interactive_mode:
+        await wait_for_user()
+        
+    crawled_content = await crawl_selected_urls(
+        urls=discovered_urls,
+        query=research_query.enhanced_query or query,
+        config=config
+    )
+    
+    console.print(f"   [green]✅ Successfully crawled {len(crawled_content)} pages[/green]")
+    
+    # Step 4: Generate synthesis
+    console.print(f"\n[bold cyan]🤖 Step 4: Synthesis Generation[/bold cyan]")
+    if config.interactive_mode:
+        await wait_for_user()
+        
+    synthesis, citations = await generate_research_synthesis(
+        query=research_query.enhanced_query or query,
+        crawled_content=crawled_content
+    )
+    
+    console.print(f"   [green]✅ Generated synthesis with {len(citations)} citations[/green]")
+    
+    # Step 5: Create result
+    result = ResearchResult(
+        query=research_query,
+        discovered_urls=discovered_urls,
+        crawled_content=crawled_content,
+        synthesis=synthesis,
+        citations=citations,
+        metadata={
+            'duration': str(datetime.now() - start_time),
+            'domain': config.domain,
+            'timestamp': datetime.now().isoformat(),
+            'config': asdict(config)
+        }
+    )
+    
+    duration = datetime.now() - start_time
+    console.print(f"\n[bold green]✅ Research completed in {duration}[/bold green]")
+    
+    return result
+
+
+async def main():
+    """
+    Main entry point for the BBC Sport Research Assistant
+    """
+    # Example queries
+    example_queries = [
+        "Premier League transfer news and rumors",
+        "Champions League match results and analysis", 
+        "World Cup qualifying updates",
+        "Football injury reports and return dates",
+        "Tennis grand slam tournament results"
+    ]
+    
+    # Display header
+    console.print(Panel.fit(
+        "[bold cyan]BBC Sport Research Assistant[/bold cyan]\n\n"
+        "This tool demonstrates efficient research using URLSeeder:\n"
+        "[dim]• Discover all URLs without crawling\n"
+        "• Filter and rank by relevance\n"
+        "• Crawl only the most relevant content\n"
+        "• Generate AI-powered insights with citations[/dim]\n\n"
+        f"[dim]📁 Working directory: {SCRIPT_DIR}[/dim]",
+        title="🔬 Welcome",
+        border_style="cyan"
+    ))
+    
+    # Configuration options table
+    config_table = Table(title="\n⚙️  Configuration Options", show_header=False, box=None)
+    config_table.add_column(style="bold cyan", width=3)
+    config_table.add_column()
+    
+    config_table.add_row("1", "Quick Test Mode (3 URLs, fast)")
+    config_table.add_row("2", "Standard Mode (10 URLs, balanced)")
+    config_table.add_row("3", "Comprehensive Mode (20 URLs, thorough)")
+    config_table.add_row("4", "Custom Configuration")
+    
+    console.print(config_table)
+    
+    config_choice = input("\nSelect configuration (1-4): ").strip()
+    
+    # Create config based on choice
+    if config_choice == "1":
+        config = ResearchConfig(test_mode=True, interactive_mode=False)
+    elif config_choice == "2":
+        config = ResearchConfig(max_urls_to_crawl=10, top_k_urls=10)
+    elif config_choice == "3":
+        config = ResearchConfig(max_urls_to_crawl=20, top_k_urls=20, max_urls_discovery=200)
+    else:
+        # Custom configuration
+        config = ResearchConfig()
+        config.test_mode = input("\nTest mode? (y/n): ").lower() == 'y'
+        config.interactive_mode = input("Interactive mode (pause between steps)? (y/n): ").lower() == 'y'
+        config.use_llm_enhancement = input("Use AI to enhance queries? (y/n): ").lower() == 'y'
+        
+        if not config.test_mode:
+            try:
+                config.max_urls_to_crawl = int(input("Max URLs to crawl (default 10): ") or "10")
+                config.top_k_urls = int(input("Top K URLs to select (default 10): ") or "10")
+            except ValueError:
+                console.print("[yellow]Using default values[/yellow]")
+    
+    # Display example queries
+    query_table = Table(title="\n📋 Example Queries", show_header=False, box=None)
+    query_table.add_column(style="bold cyan", width=3)
+    query_table.add_column()
+    
+    for i, q in enumerate(example_queries, 1):
+        query_table.add_row(str(i), q)
+    
+    console.print(query_table)
+    
+    query_input = input("\nSelect a query (1-5) or enter your own: ").strip()
+    
+    if query_input.isdigit() and 1 <= int(query_input) <= len(example_queries):
+        query = example_queries[int(query_input) - 1]
+    else:
+        query = query_input if query_input else example_queries[0]
+    
+    console.print(f"\n[bold cyan]📝 Selected Query:[/bold cyan] {query}")
+    
+    # Run the research pipeline
+    result = await research_pipeline(query=query, config=config)
+    
+    # Display results
+    formatted_output = format_research_output(result)
+    # print(formatted_output)
+    console.print(Panel.fit(
+        formatted_output,
+        title="🔬 Research Results",
+        border_style="green"
+    ))
+    
+    # Save results
+    if config.save_json or config.save_markdown:
+        json_path, md_path = await save_research_results(result, config)
+        # print(f"\n✅ Results saved successfully!")
+        if json_path:
+            console.print(f"[green]JSON saved at:[/green] {json_path}")
+        if md_path:
+            console.print(f"[green]Markdown saved at:[/green] {md_path}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file