Add BBC Sp0ort Research Assistant pipeline example

- Implemented a comprehensive research pipeline using URLSeeder. - Steps include user query input, optional LLM enhancement, URL discovery and ranking, content crawling, and synthesis generation. - Introduced caching mechanism for enhanced query results and crawled content. - Configurable settings for testing and production modes. - Output results in JSON and Markdown formats with detailed research insights and citations.
2025-06-04 23:23:21 +08:00
parent 09fd3e152a
commit b5c2732f88
2 changed files with 1324 additions and 273 deletions
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -14,7 +14,16 @@ Features
 """
 from __future__ import annotations
-import aiofiles, asyncio, gzip, hashlib, io, json, os, pathlib, re, time
+import aiofiles
 import asyncio
 import gzip
 import hashlib
 import io
 import json
 import os
 import pathlib
 import re
 import time
 from datetime import timedelta
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -42,7 +51,8 @@ except ImportError:
 # Import AsyncLoggerBase from crawl4ai's logger module
 # Assuming crawl4ai/async_logger.py defines AsyncLoggerBase
 # You might need to adjust this import based on your exact file structure
-from .async_logger import AsyncLoggerBase, AsyncLogger # Import AsyncLogger for default if needed
+# Import AsyncLogger for default if needed
 from .async_logger import AsyncLoggerBase, AsyncLogger
 # Import SeedingConfig for type hints
 from typing import TYPE_CHECKING
@@ -62,9 +72,12 @@ _meta_rx = re.compile(
    re.I)
 _charset_rx = re.compile(r'<meta\s+[^>]*charset=["\']?([^"\' >]+)', re.I)
 _title_rx = re.compile(r'<title>(.*?)</title>', re.I | re.S)
-_link_rx    = re.compile(r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
+_link_rx = re.compile(
    r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
 # ────────────────────────────────────────────────────────────────────────── helpers
 def _match(url: str, pattern: str) -> bool:
    if fnmatch.fnmatch(url, pattern):
        return True
@@ -72,11 +85,13 @@ def _match(url: str, pattern: str) -> bool:
    return (fnmatch.fnmatch(canon, pattern)
            or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern)))
 def _parse_head(src: str) -> Dict[str, Any]:
    if LXML:
        try:
            if isinstance(src, str):
-                src = src.encode("utf-8", "replace")     # strip Unicode, let lxml decode
+                # strip Unicode, let lxml decode
                src = src.encode("utf-8", "replace")
            doc = lxml_html.fromstring(src)
        except (ValueError, etree.ParserError):
            return {}        # malformed, bail gracefully
@@ -87,13 +102,18 @@ def _parse_head(src: str) -> Dict[str, Any]:
            "meta": {}, "link": {}, "jsonld": []
        }
        for el in doc.xpath(".//meta"):
-            k = el.attrib.get("name") or el.attrib.get("property") or el.attrib.get("http-equiv")
+            k = el.attrib.get("name") or el.attrib.get(
-            if k: info["meta"][k.lower()] = el.attrib.get("content", "")
+                "property") or el.attrib.get("http-equiv")
-            elif "charset" in el.attrib: info["charset"] = el.attrib["charset"].lower()
+            if k:
                info["meta"][k.lower()] = el.attrib.get("content", "")
            elif "charset" in el.attrib:
                info["charset"] = el.attrib["charset"].lower()
        for el in doc.xpath(".//link"):
            rel = " ".join(el.attrib.get("rel", [])).lower()
-            if not rel: continue
+            if not rel:
-            entry = {a: el.attrib[a] for a in ("href","as","type","hreflang") if a in el.attrib}
+                continue
            entry = {a: el.attrib[a] for a in (
                "href", "as", "type", "hreflang") if a in el.attrib}
            info["link"].setdefault(rel, []).append(entry)
        # Extract JSON-LD structured data
        for script in doc.xpath('.//script[@type="application/ld+json"]'):
@@ -109,14 +129,19 @@ def _parse_head(src: str) -> Dict[str, Any]:
            info["lang"] = html_elem.attrib.get("lang", "")
        return info
    # regex fallback
-    info: Dict[str,Any] = {"title":None,"charset":None,"meta":{},"link":{},"jsonld":[],"lang":""}
+    info: Dict[str, Any] = {"title": None, "charset": None,
-    m=_title_rx.search(src);            info["title"]=m.group(1).strip() if m else None
+                            "meta": {}, "link": {}, "jsonld": [], "lang": ""}
-    for k,v in _meta_rx.findall(src):   info["meta"][k.lower()]=v
+    m = _title_rx.search(src)
-    m=_charset_rx.search(src);          info["charset"]=m.group(1).lower() if m else None
+    info["title"] = m.group(1).strip() if m else None
    for k, v in _meta_rx.findall(src):
        info["meta"][k.lower()] = v
    m = _charset_rx.search(src)
    info["charset"] = m.group(1).lower() if m else None
    for rel, href in _link_rx.findall(src):
        info["link"].setdefault(rel.lower(), []).append({"href": href})
    # Try to extract JSON-LD with regex
-    jsonld_pattern = re.compile(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I|re.S)
+    jsonld_pattern = re.compile(
        r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I | re.S)
    for match in jsonld_pattern.findall(src):
        try:
            jsonld_data = json.loads(match.strip())
@@ -130,15 +155,40 @@ def _parse_head(src: str) -> Dict[str, Any]:
    return info
 # ────────────────────────────────────────────────────────────────────────── class
 class AsyncUrlSeeder:
    """
    Async version of UrlSeeder.
    Call pattern is await/async for / async with.
-    Public coroutine
+    Public coroutines
-    ----------------
+    -----------------
    await seed.urls(...)
        returns List[Dict[str,Any]]  (url, status, head_data)
    await seed.many_urls(...)
        returns Dict[str, List[Dict[str,Any]]]
    await seed.close()
        closes the HTTP client if owned by seeder
    Usage examples
    --------------
    # Manual cleanup:
    seeder = AsyncUrlSeeder()
    try:
        urls = await seeder.urls("example.com", config)
    finally:
        await seeder.close()
    # Using async context manager (recommended):
    async with AsyncUrlSeeder() as seeder:
        urls = await seeder.urls("example.com", config)
    # Reusing existing client:
    client = httpx.AsyncClient()
    seeder = AsyncUrlSeeder(client=client)
    urls = await seeder.urls("example.com", config)
    # No need to close seeder, as it doesn't own the client
    """
    def __init__(
@@ -146,25 +196,31 @@ class AsyncUrlSeeder:
        ttl: timedelta = TTL,
        client: Optional[httpx.AsyncClient] = None,
        logger: Optional[AsyncLoggerBase] = None,  # NEW: Add logger parameter
-        base_directory: Optional[Union[str, pathlib.Path]] = None, # NEW: Add base_directory
+        # NEW: Add base_directory
        base_directory: Optional[Union[str, pathlib.Path]] = None,
        cache_root: Optional[Union[str, Path]] = None,
    ):
        self.ttl = ttl
        self._owns_client = client is None  # Track if we created the client
        self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        })
        self.logger = logger  # Store the logger instance
-        self.base_directory = pathlib.Path(base_directory or os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory
+        self.base_directory = pathlib.Path(base_directory or os.getenv(
-        self.cache_dir = self.base_directory / ".crawl4ai" / "seeder_cache" # NEW: Specific cache dir for seeder
+            "CRAWL4_AI_BASE_DIRECTORY", Path.home()))  # Resolve base_directory
        self.cache_dir = self.base_directory / ".crawl4ai" / \
            "seeder_cache"  # NEW: Specific cache dir for seeder
        self.cache_dir.mkdir(parents=True, exist_ok=True)  # Ensure it exists
-        self.index_cache_path = self.cache_dir / "latest_cc_index.txt" # NEW: Index cache path
+        self.index_cache_path = self.cache_dir / \
            "latest_cc_index.txt"  # NEW: Index cache path
        # defer – grabbing the index inside an active loop blows up
        self.index_id: Optional[str] = None
        self._rate_sem: Optional[asyncio.Semaphore] = None
        # ───────── cache dirs ─────────
-        self.cache_root = Path(os.path.expanduser(cache_root or "~/.cache/url_seeder"))
+        self.cache_root = Path(os.path.expanduser(
            cache_root or "~/.cache/url_seeder"))
        (self.cache_root / "live").mkdir(parents=True, exist_ok=True)
        (self.cache_root / "head").mkdir(exist_ok=True)
@@ -173,7 +229,8 @@ class AsyncUrlSeeder:
        if self.logger:
            log_method = getattr(self.logger, level, None)
            if log_method:
-                log_method(message=message, tag=tag, params=kwargs.get('params', {}))
+                log_method(message=message, tag=tag,
                           params=kwargs.get('params', {}))
            # else: # Fallback for unknown level, should not happen with AsyncLoggerBase
            #     print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}")
@@ -182,28 +239,27 @@ class AsyncUrlSeeder:
        h = hashlib.sha1(url.encode()).hexdigest()
        return self.cache_root / kind / f"{h}.json"
-    def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
+    async def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
        p = self._cache_path(kind, url)
        if not p.exists():
            return None
        # TTL check
        if time.time()-p.stat().st_mtime > self.ttl.total_seconds():
            return None
        try:
-            return json.loads(p.read_text())
+            async with aiofiles.open(p, "r") as f:
                return json.loads(await f.read())
        except Exception:
            return None
-    def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
+    async def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
        try:
-            self._cache_path(kind, url).write_text(
+            async with aiofiles.open(self._cache_path(kind, url), "w") as f:
-                json.dumps(data, separators=(",", ":"))
+                await f.write(json.dumps(data, separators=(",", ":")))
            )
        except Exception:
            pass
    # ─────────────────────────────── discovery entry
    async def urls(self,
                   domain: str,
                   config: "SeedingConfig",
@@ -228,7 +284,8 @@ class AsyncUrlSeeder:
        hits_per_sec = config.hits_per_sec
        self.force = config.force  # Store force flag as instance attribute
        force = config.force
-        verbose = config.verbose if config.verbose is not None else (self.logger.verbose if self.logger else False)
+        verbose = config.verbose if config.verbose is not None else (
            self.logger.verbose if self.logger else False)
        max_urls = config.max_urls if config.max_urls is not None else -1
        query = config.query
        score_threshold = config.score_threshold
@@ -247,11 +304,13 @@ class AsyncUrlSeeder:
        valid_sources = {"cc", "sitemap"}
        for s in sources:
            if s not in valid_sources:
-                raise ValueError(f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
+                raise ValueError(
                    f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
        if hits_per_sec:
            if hits_per_sec <= 0:
-                self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
+                self._log(
                    "warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
                self._rate_sem = None
            else:
                self._rate_sem = asyncio.Semaphore(hits_per_sec)
@@ -268,11 +327,14 @@ class AsyncUrlSeeder:
                async for u in self._from_sitemaps(domain, pattern, force):
                    yield u
            if "cc" in sources:
-                self._log("debug", "Fetching from Common Crawl...", tag="URL_SEED")
+                self._log("debug", "Fetching from Common Crawl...",
                          tag="URL_SEED")
                async for u in self._from_cc(domain, pattern, force):
                    yield u
-        queue = asyncio.Queue()
+        # Use bounded queue to prevent RAM spikes with large domains
        queue_size = min(10000, max(1000, concurrency * 100))  # Dynamic size based on concurrency
        queue = asyncio.Queue(maxsize=queue_size)
        producer_done = asyncio.Event()
        stop_event = asyncio.Event()
        seen: set[str] = set()
@@ -285,27 +347,31 @@ class AsyncUrlSeeder:
                                  params={"url": u}, tag="URL_SEED")
                        continue
                    if stop_event.is_set():
-                        self._log("info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
+                        self._log(
                            "info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
                        break
-                    await queue.put(u)
+                    seen.add(u)
                    await queue.put(u)  # Will block if queue is full, providing backpressure
            except Exception as e:
-                self._log("error", "Producer encountered an error: {error}", params={"error": str(e)}, tag="URL_SEED")
+                self._log("error", "Producer encountered an error: {error}", params={
                          "error": str(e)}, tag="URL_SEED")
            finally:
                producer_done.set()
                self._log("debug", "Producer finished.", tag="URL_SEED")
        async def worker(res_list: List[Dict[str, Any]]):
            while True:
                if queue.empty() and producer_done.is_set():
                    # self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED")
                    break
                try:
-                    url = await asyncio.wait_for(queue.get(), 5) # Increased timeout slightly
+                    # Increased timeout slightly
                    url = await asyncio.wait_for(queue.get(), 5)
                except asyncio.TimeoutError:
                    continue  # Keep checking queue and producer_done status
                except Exception as e:
-                    self._log("error", "Worker failed to get URL from queue: {error}", params={"error": str(e)}, tag="URL_SEED")
+                    self._log("error", "Worker failed to get URL from queue: {error}", params={
                              "error": str(e)}, tag="URL_SEED")
                    continue
                if max_urls > 0 and len(res_list) >= max_urls:
@@ -332,16 +398,17 @@ class AsyncUrlSeeder:
                if self._rate_sem:  # global QPS control
                    async with self._rate_sem:
                        await self._validate(url, res_list, live_check, extract_head,
-                                             head_timeout, verbose)
+                                             head_timeout, verbose, query, score_threshold, scoring_method)
                else:
                    await self._validate(url, res_list, live_check, extract_head,
-                                         head_timeout, verbose)
+                                         head_timeout, verbose, query, score_threshold, scoring_method)
                queue.task_done()  # Mark task as done for queue.join() if ever used
        # launch
        results: List[Dict[str, Any]] = []
        prod_task = asyncio.create_task(producer())
-        workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)]
+        workers = [asyncio.create_task(worker(results))
                   for _ in range(concurrency)]
        # Wait for all workers to finish
        await asyncio.gather(prod_task, *workers)
@@ -350,52 +417,15 @@ class AsyncUrlSeeder:
        self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
                  params={"domain": domain, "count": len(results)}, tag="URL_SEED")
-        # Apply BM25 scoring if query is provided and extract_head is enabled
+        # Sort by relevance score if query was provided
        if query and extract_head and scoring_method == "bm25":
-            self._log("info", "Applying BM25 scoring for query: '{query}'", 
+            results.sort(key=lambda x: x.get(
-                      params={"query": query}, tag="URL_SEED")
+                "relevance_score", 0.0), reverse=True)
-            
+            self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
-            # Extract text contexts from all results
+                      params={"count": len(results), "query": query}, tag="URL_SEED")
            documents = []
            valid_indices = []
            for i, result in enumerate(results):
                if result.get("head_data"):
                    text_context = self._extract_text_context(result["head_data"])
                    if text_context:  # Only include non-empty contexts
                        documents.append(text_context)
                        valid_indices.append(i)
            if documents:
                # Calculate BM25 scores
                scores = self._calculate_bm25_score(query, documents)
                # Add scores to results
                for idx, score in zip(valid_indices, scores):
                    results[idx]["relevance_score"] = float(score)
                # Add zero scores to results without head_data
                for i, result in enumerate(results):
                    if i not in valid_indices:
                        result["relevance_score"] = 0.0
                # Filter by score threshold if specified
                if score_threshold is not None:
                    original_count = len(results)
                    results = [r for r in results if r.get("relevance_score", 0.0) >= score_threshold]
                    self._log("info", "Filtered {filtered} URLs below score threshold {threshold}. Remaining: {remaining}", 
                              params={"filtered": original_count - len(results), 
                                      "threshold": score_threshold, 
                                      "remaining": len(results)}, tag="URL_SEED")
                # Sort by relevance score (highest first)
                results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
            else:
                self._log("warning", "No valid head data found for BM25 scoring.", tag="URL_SEED")
                # Add zero scores to all results
                for result in results:
                    result["relevance_score"] = 0.0
        elif query and not extract_head:
-            self._log("warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
+            self._log(
                "warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
        return results[:max_urls] if max_urls > 0 else results
@@ -430,7 +460,8 @@ class AsyncUrlSeeder:
        results = await asyncio.gather(*tasks)
        final_results = dict(zip(domains, results))
-        self._log("info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
+        self._log(
            "info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
        return final_results
    async def _resolve_head(self, url: str) -> Optional[str]:
@@ -462,14 +493,14 @@ class AsyncUrlSeeder:
                      params={"url": url, "err": str(e)}, tag="URL_SEED")
            return None
    # ─────────────────────────────── CC
    async def _from_cc(self, domain: str, pattern: str, force: bool):
        import re
        digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
        # ── normalise for CC   (strip scheme, query, fragment)
-        raw = re.sub(r'^https?://', '', domain).split('#', 1)[0].split('?', 1)[0].lstrip('.')
+        raw = re.sub(r'^https?://', '', domain).split('#',
                                                      1)[0].split('?', 1)[0].lstrip('.')
        # ── sanitize only for cache-file name
        safe = re.sub('[/?#]+', '_', raw)
@@ -481,7 +512,8 @@ class AsyncUrlSeeder:
            async with aiofiles.open(path, "r") as fp:
                async for line in fp:
                    url = line.strip()
-                    if _match(url,pattern): yield url
+                    if _match(url, pattern):
                        yield url
            return
        # build CC glob – if a path is present keep it, else add trailing /*
@@ -500,7 +532,8 @@ class AsyncUrlSeeder:
                            rec = json.loads(line)
                            u = rec["url"]
                            await fp.write(u+"\n")
-                            if _match(u,pattern): yield u
+                            if _match(u, pattern):
                                yield u
                return
            except httpx.HTTPStatusError as e:
                if e.response.status_code == 503 and i < len(retries):
@@ -516,7 +549,6 @@ class AsyncUrlSeeder:
                          params={"domain": domain, "error": str(e)}, tag="URL_SEED")
                raise
    # ─────────────────────────────── Sitemaps
    async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False):
        """
@@ -551,7 +583,8 @@ class AsyncUrlSeeder:
                sm = f"{scheme}://{host}{suffix}"
                sm = await self._resolve_head(sm)
                if sm:
-                    self._log("info","Found sitemap at {url}",params={"url":sm},tag="URL_SEED")
+                    self._log("info", "Found sitemap at {url}", params={
                              "url": sm}, tag="URL_SEED")
                    async with aiofiles.open(path, "w") as fp:
                        async for u in self._iter_sitemap(sm):
                            await fp.write(u + "\n")
@@ -564,11 +597,14 @@ class AsyncUrlSeeder:
        try:
            r = await self.client.get(robots, timeout=10, follow_redirects=True)
            if not 200 <= r.status_code < 300:
-                self._log("warning","robots.txt unavailable for {d} HTTP{c}",params={"d":domain,"c":r.status_code},tag="URL_SEED")
+                self._log("warning", "robots.txt unavailable for {d} HTTP{c}", params={
                          "d": domain, "c": r.status_code}, tag="URL_SEED")
                return
-            sitemap_lines=[l.split(":",1)[1].strip() for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
+            sitemap_lines = [l.split(":", 1)[1].strip(
            ) for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
        except Exception as e:
-            self._log("warning","Failed to fetch robots.txt for {d}: {e}",params={"d":domain,"e":str(e)},tag="URL_SEED")
+            self._log("warning", "Failed to fetch robots.txt for {d}: {e}", params={
                      "d": domain, "e": str(e)}, tag="URL_SEED")
            return
        if sitemap_lines:
@@ -598,6 +634,11 @@ class AsyncUrlSeeder:
        data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
        # Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
        is_sitemap_index = False
        sub_sitemaps = []
        regular_urls = []
        # Use lxml for XML parsing if available, as it's generally more robust
        if LXML:
            try:
@@ -608,19 +649,21 @@ class AsyncUrlSeeder:
                # Define namespace for sitemap
                ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
-                # First check if this is a sitemap index
+                # Check for sitemap index entries
-                for sitemap_elem in root.xpath('//s:sitemap/s:loc', namespaces=ns):
+                sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
                if sitemap_locs:
                    is_sitemap_index = True
                    for sitemap_elem in sitemap_locs:
                        loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
                        if loc:
-                        self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
+                            sub_sitemaps.append(loc)
                        async for u in self._iter_sitemap(loc):
                            yield u
-                # Then check for regular URLs
+                # If not a sitemap index, get regular URLs
                if not is_sitemap_index:
                    for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
                        loc = loc_elem.text.strip() if loc_elem.text else ""
                        if loc:
-                        yield loc
+                            regular_urls.append(loc)
            except Exception as e:
                self._log("error", "LXML parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -636,70 +679,161 @@ class AsyncUrlSeeder:
                        elem.tag = elem.tag.split('}')[1]
                # Check for sitemap index entries
-                for sitemap in root.findall('.//sitemap'):
+                sitemaps = root.findall('.//sitemap')
                if sitemaps:
                    is_sitemap_index = True
                    for sitemap in sitemaps:
                        loc_elem = sitemap.find('loc')
                        if loc_elem is not None and loc_elem.text:
-                        loc = loc_elem.text.strip()
+                            sub_sitemaps.append(loc_elem.text.strip())
                        self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
                        async for u in self._iter_sitemap(loc):
                            yield u
-                # Check for regular URL entries
+                # If not a sitemap index, get regular URLs
-                for url in root.findall('.//url'):
+                if not is_sitemap_index:
-                    loc_elem = url.find('loc')
+                    for url_elem in root.findall('.//url'):
                        loc_elem = url_elem.find('loc')
                        if loc_elem is not None and loc_elem.text:
-                        yield loc_elem.text.strip()
+                            regular_urls.append(loc_elem.text.strip())
            except Exception as e:
                self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
                          params={"url": url, "error": str(e)}, tag="URL_SEED")
                return
        # Process based on type
        if is_sitemap_index and sub_sitemaps:
            self._log("info", "Processing sitemap index with {count} sub-sitemaps in parallel",
                      params={"count": len(sub_sitemaps)}, tag="URL_SEED")
            # Create a bounded queue for results to prevent RAM issues
            # For sitemap indexes, use a larger queue as we expect many URLs
            queue_size = min(50000, len(sub_sitemaps) * 1000)  # Estimate 1000 URLs per sitemap
            result_queue = asyncio.Queue(maxsize=queue_size)
            completed_count = 0
            total_sitemaps = len(sub_sitemaps)
            async def process_subsitemap(sitemap_url: str):
                try:
                    self._log(
                        "debug", "Processing sub-sitemap: {url}", params={"url": sitemap_url}, tag="URL_SEED")
                    # Recursively process sub-sitemap
                    async for u in self._iter_sitemap(sitemap_url):
                        await result_queue.put(u)  # Will block if queue is full
                except Exception as e:
                    self._log("error", "Error processing sub-sitemap {url}: {error}",
                              params={"url": sitemap_url, "error": str(e)}, tag="URL_SEED")
                finally:
                    # Put sentinel to signal completion
                    await result_queue.put(None)
            # Start all tasks
            tasks = [asyncio.create_task(process_subsitemap(sm))
                     for sm in sub_sitemaps]
            # Yield results as they come in
            while completed_count < total_sitemaps:
                item = await result_queue.get()
                if item is None:
                    completed_count += 1
                else:
                    yield item
            # Ensure all tasks are done
            await asyncio.gather(*tasks, return_exceptions=True)
        else:
            # Regular sitemap - yield URLs directly
            for u in regular_urls:
                yield u
    # ─────────────────────────────── validate helpers
    async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
-                        extract:bool, timeout:int, verbose:bool):
+                        extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
                        score_threshold: Optional[float] = None, scoring_method: str = "bm25"):
        # Local verbose parameter for this function is used to decide if intermediate logs should be printed
        # The main logger's verbose status should be controlled by the caller.
        cache_kind = "head" if extract else "live"
        # ---------- try cache ----------
-        if (live or extract) and not (hasattr(self, 'force') and self.force):
+        if not (hasattr(self, 'force') and self.force):
-            cached = self._cache_get(cache_kind, url)
+            cached = await self._cache_get(cache_kind, url)
            if cached:
                res_list.append(cached)
                return
        if extract:
-            self._log("debug", "Fetching head for {url}", params={"url": url}, tag="URL_SEED")
+            self._log("debug", "Fetching head for {url}", params={
                      "url": url}, tag="URL_SEED")
            ok, html, final = await self._fetch_head(url, timeout)
            status = "valid" if ok else "not_valid"
            self._log("info" if ok else "warning", "HEAD {status} for {final_url}",
                      params={"status": status.upper(), "final_url": final or url}, tag="URL_SEED")
            # head_data = _parse_head(html) if ok else {}
            head_data = await asyncio.to_thread(_parse_head, html) if ok else {}
            entry = {
                "url": final or url,
                "status": status,
-                "head_data": _parse_head(html) if ok else {},
+                "head_data": head_data,
            }
-            if live or extract:
+
-                self._cache_set(cache_kind, url, entry)
+            # Apply BM25 scoring if query is provided and head data exists
-            res_list.append(entry)
+            if query and ok and scoring_method == "bm25" and head_data:
                text_context = self._extract_text_context(head_data)
                if text_context:
                    # Calculate BM25 score for this single document
                    # scores = self._calculate_bm25_score(query, [text_context])
                    scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
                    relevance_score = scores[0] if scores else 0.0
                    entry["relevance_score"] = float(relevance_score)
                else:
                    # No text context, use URL-based scoring as fallback
                    relevance_score = self._calculate_url_relevance_score(
                        query, entry["url"])
                    entry["relevance_score"] = float(relevance_score)
            elif query:
                # Query provided but no head data - we reject this entry
                self._log("debug", "No head data for {url}, using URL-based scoring",
                          params={"url": url}, tag="URL_SEED")
                return
                # relevance_score = self._calculate_url_relevance_score(query, entry["url"])
                # entry["relevance_score"] = float(relevance_score)
        elif live:
-            self._log("debug", "Performing live check for {url}", params={"url": url}, tag="URL_SEED")
+            self._log("debug", "Performing live check for {url}", params={
                      "url": url}, tag="URL_SEED")
            ok = await self._resolve_head(url)
            status = "valid" if ok else "not_valid"
            self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}",
                      params={"status": status.upper(), "url": url}, tag="URL_SEED")
            entry = {"url": url, "status": status, "head_data": {}}
-            if live or extract:
+
-                self._cache_set(cache_kind, url, entry)
+            # Apply URL-based scoring if query is provided
-            res_list.append(entry)
+            if query:
                relevance_score = self._calculate_url_relevance_score(
                    query, url)
                entry["relevance_score"] = float(relevance_score)
        else:
            entry = {"url": url, "status": "unknown", "head_data": {}}
            if live or extract:
                self._cache_set(cache_kind, url, entry)
            res_list.append(entry)
            # Apply URL-based scoring if query is provided
            if query:
                relevance_score = self._calculate_url_relevance_score(
                    query, url)
                entry["relevance_score"] = float(relevance_score)
        # Now decide whether to add the entry based on score threshold
        if query and "relevance_score" in entry:
            if score_threshold is None or entry["relevance_score"] >= score_threshold:
                if live or extract:
                    await self._cache_set(cache_kind, url, entry)
                res_list.append(entry)
            else:
                self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
                          params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
        else:
            # No query or no scoring - add as usual
            if live or extract:
                await self._cache_set(cache_kind, url, entry)
            res_list.append(entry)
    async def _head_ok(self, url: str, timeout: int) -> bool:
        try:
@@ -753,10 +887,12 @@ class AsyncUrlSeeder:
                        else:
                            self._log("warning", "Redirect status {status_code} but no Location header for {url}",
                                      params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
-                            return False, "", str(r.url) # Return original URL if no new location
+                            # Return original URL if no new location
                            return False, "", str(r.url)
                    # For 2xx or other non-redirect codes, proceed to read content
-                    if not (200 <= r.status_code < 400): # Only allow successful codes, or continue
+                    # Only allow successful codes, or continue
                    if not (200 <= r.status_code < 400):
                        self._log("warning", "Non-success status {status_code} when fetching head for {url}",
                                  params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
                        return False, "", str(r.url)
@@ -787,7 +923,8 @@ class AsyncUrlSeeder:
                        self._log(
                            "warning",
                            "Decompression error for {url} ({encoding}): {error}",
-                            params={"url": r.url, "encoding": enc, "error": str(e)},
+                            params={"url": r.url,
                                    "encoding": enc, "error": str(e)},
                            tag="URL_SEED",
                        )
                        # fall through with raw buf
@@ -798,7 +935,8 @@ class AsyncUrlSeeder:
                        self._log("debug", "No </head> tag found in initial bytes of {url}",
                                  params={"url": r.url}, tag="URL_SEED")
                        # If no </head> is found, take a reasonable chunk or all if small
-                        html_bytes = buf if len(buf) < 10240 else buf[:10240] # Take max 10KB if no head tag
+                        # Take max 10KB if no head tag
                        html_bytes = buf if len(buf) < 10240 else buf[:10240]
                    else:
                        html_bytes = buf[:idx+7]  # Include </head> tag
@@ -813,7 +951,8 @@ class AsyncUrlSeeder:
                        )
                        html = html_bytes.decode("latin-1", "replace")
-                    return True,html,str(r.url) # Return the actual URL after redirects
+                    # Return the actual URL after redirects
                    return True, html, str(r.url)
            except httpx.RequestError as e:
                self._log("debug", "Fetch head network error for {url}: {error}",
@@ -865,7 +1004,8 @@ class AsyncUrlSeeder:
                        if isinstance(jsonld[field], str):
                            text_parts.append(jsonld[field])
                        elif isinstance(jsonld[field], list):
-                            text_parts.extend(str(item) for item in jsonld[field] if item)
+                            text_parts.extend(str(item)
                                              for item in jsonld[field] if item)
                # Handle @graph structures
                if "@graph" in jsonld and isinstance(jsonld["@graph"], list):
@@ -878,10 +1018,99 @@ class AsyncUrlSeeder:
        # Combine all text parts
        return " ".join(filter(None, text_parts))
    def _calculate_url_relevance_score(self, query: str, url: str) -> float:
        """Calculate relevance score between query and URL using string matching."""
        # Normalize inputs
        query_lower = query.lower()
        url_lower = url.lower()
        # Extract URL components
        from urllib.parse import urlparse
        parsed = urlparse(url)
        domain = parsed.netloc.replace('www.', '')
        path = parsed.path.strip('/')
        # Create searchable text from URL
        # Split domain by dots and path by slashes
        domain_parts = domain.split('.')
        path_parts = [p for p in path.split('/') if p]
        # Include query parameters if any
        query_params = parsed.query
        param_parts = []
        if query_params:
            for param in query_params.split('&'):
                if '=' in param:
                    key, value = param.split('=', 1)
                    param_parts.extend([key, value])
        # Combine all parts
        all_parts = domain_parts + path_parts + param_parts
        # Calculate scores
        scores = []
        query_tokens = query_lower.split()
        # 1. Exact match in any part (highest score)
        for part in all_parts:
            part_lower = part.lower()
            if query_lower in part_lower:
                scores.append(1.0)
            elif part_lower in query_lower:
                scores.append(0.9)
        # 2. Token matching
        for token in query_tokens:
            token_scores = []
            for part in all_parts:
                part_lower = part.lower()
                if token in part_lower:
                    # Score based on how much of the part the token covers
                    coverage = len(token) / len(part_lower)
                    token_scores.append(0.7 * coverage)
                elif part_lower in token:
                    coverage = len(part_lower) / len(token)
                    token_scores.append(0.6 * coverage)
            if token_scores:
                scores.append(max(token_scores))
        # 3. Character n-gram similarity (for fuzzy matching)
        def get_ngrams(text, n=3):
            return set(text[i:i+n] for i in range(len(text)-n+1))
        # Combine all URL parts into one string for n-gram comparison
        url_text = ' '.join(all_parts).lower()
        if len(query_lower) >= 3 and len(url_text) >= 3:
            query_ngrams = get_ngrams(query_lower)
            url_ngrams = get_ngrams(url_text)
            if query_ngrams and url_ngrams:
                intersection = len(query_ngrams & url_ngrams)
                union = len(query_ngrams | url_ngrams)
                jaccard = intersection / union if union > 0 else 0
                scores.append(0.5 * jaccard)
        # Calculate final score
        if not scores:
            return 0.0
        # Weighted average with bias towards higher scores
        scores.sort(reverse=True)
        weighted_score = 0
        total_weight = 0
        for i, score in enumerate(scores):
            weight = 1 / (i + 1)  # Higher weight for better matches
            weighted_score += score * weight
            total_weight += weight
        final_score = weighted_score / total_weight if total_weight > 0 else 0
        return min(final_score, 1.0)  # Cap at 1.0
    def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
        """Calculate BM25 scores for documents against a query."""
        if not HAS_BM25:
-            self._log("warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
+            self._log(
                "warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
            return [0.0] * len(documents)
        if not query or not documents:
@@ -912,6 +1141,22 @@ class AsyncUrlSeeder:
                      params={"error": str(e)}, tag="URL_SEED")
            return [0.0] * len(documents)
    # ─────────────────────────────── cleanup methods
    async def close(self):
        """Close the HTTP client if we own it."""
        if self._owns_client and self.client:
            await self.client.aclose()
            self._log("debug", "Closed HTTP client", tag="URL_SEED")
    async def __aenter__(self):
        """Async context manager entry."""
        return self
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()
        return False
    # ─────────────────────────────── index helper
    async def _latest_index(self) -> str:
        if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime) < self.ttl.total_seconds():
--- a/docs/examples/url_seeder/bbc_sport_research_assistant.py
+++ b/docs/examples/url_seeder/bbc_sport_research_assistant.py
@@ -0,0 +1,806 @@
 """
 BBC Sport Research Assistant Pipeline
 =====================================
 This example demonstrates how URLSeeder helps create an efficient research pipeline:
 1. Discover all available URLs without crawling
 2. Filter and rank them based on relevance
 3. Crawl only the most relevant content
 4. Generate comprehensive research insights
 Pipeline Steps:
 1. Get user query
 2. Optionally enhance query using LLM
 3. Use URLSeeder to discover and rank URLs
 4. Crawl top K URLs with BM25 filtering
 5. Generate detailed response with citations
 Requirements:
 - pip install crawl4ai
 - pip install litellm
 - export GEMINI_API_KEY="your-api-key"
 Usage:
 - Run normally: python bbc_sport_research_assistant.py
 - Run test mode: python bbc_sport_research_assistant.py test
 """
 import asyncio
 import json
 import os
 import hashlib
 import pickle
 from typing import List, Dict, Optional, Tuple
 from dataclasses import dataclass, asdict
 from datetime import datetime
 from pathlib import Path
 # Rich for colored output
 from rich.console import Console
 from rich.text import Text
 from rich.panel import Panel
 from rich.table import Table
 from rich.progress import Progress, SpinnerColumn, TextColumn
 # Crawl4AI imports
 from crawl4ai import (
    AsyncWebCrawler, 
    BrowserConfig, 
    CrawlerRunConfig,
    AsyncUrlSeeder, 
    SeedingConfig,
    AsyncLogger
 )
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 # LiteLLM for AI communication
 import litellm
 # Initialize Rich console
 console = Console()
 # Get the current directory where this script is located
 SCRIPT_DIR = Path(__file__).parent.resolve()
 # Cache configuration - relative to script directory
 CACHE_DIR = SCRIPT_DIR / "temp_cache"
 CACHE_DIR.mkdir(parents=True, exist_ok=True)
 # Testing limits
 TESTING_MODE = True
 MAX_URLS_DISCOVERY = 100 if TESTING_MODE else 1000
 MAX_URLS_TO_CRAWL = 5 if TESTING_MODE else 10
 def get_cache_key(prefix: str, *args) -> str:
    """Generate cache key from prefix and arguments"""
    content = f"{prefix}:{'|'.join(str(arg) for arg in args)}"
    return hashlib.md5(content.encode()).hexdigest()
 def load_from_cache(cache_key: str) -> Optional[any]:
    """Load data from cache if exists"""
    cache_path = CACHE_DIR / f"{cache_key}.pkl"
    if cache_path.exists():
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    return None
 def save_to_cache(cache_key: str, data: any) -> None:
    """Save data to cache"""
    cache_path = CACHE_DIR / f"{cache_key}.pkl"
    with open(cache_path, 'wb') as f:
        pickle.dump(data, f)
@dataclass
 class ResearchConfig:
    """Configuration for research pipeline"""
    # Core settings
    domain: str = "www.bbc.com/sport"
    max_urls_discovery: int = 100
    max_urls_to_crawl: int = 10
    top_k_urls: int = 10
    # Scoring and filtering
    score_threshold: float = 0.1
    scoring_method: str = "bm25"
    # Processing options
    use_llm_enhancement: bool = True
    extract_head_metadata: bool = True
    live_check: bool = True
    force_refresh: bool = False
    # Crawler settings
    max_concurrent_crawls: int = 5
    timeout: int = 30000
    headless: bool = True
    # Output settings
    save_json: bool = True
    save_markdown: bool = True
    output_dir: str = None  # Will be set in __post_init__
    # Development settings
    test_mode: bool = False
    interactive_mode: bool = False
    verbose: bool = True
    def __post_init__(self):
        """Adjust settings based on test mode"""
        if self.test_mode:
            self.max_urls_discovery = 50
            self.max_urls_to_crawl = 3
            self.top_k_urls = 5
        # Set default output directory relative to script location
        if self.output_dir is None:
            self.output_dir = str(SCRIPT_DIR / "research_results")
@dataclass
 class ResearchQuery:
    """Container for research query and metadata"""
    original_query: str
    enhanced_query: Optional[str] = None
    search_patterns: List[str] = None
    timestamp: str = None
@dataclass
 class ResearchResult:
    """Container for research results"""
    query: ResearchQuery
    discovered_urls: List[Dict]
    crawled_content: List[Dict]
    synthesis: str
    citations: List[Dict]
    metadata: Dict
 async def get_user_query() -> str:
    """
    Get research query from user input
    """
    query = input("\n🔍 Enter your research query: ")
    return query.strip()
 async def enhance_query_with_llm(query: str) -> ResearchQuery:
    """
    Use LLM to enhance the research query:
    - Extract key terms
    - Generate search patterns
    - Identify related topics
    """
    # Check cache
    cache_key = get_cache_key("enhanced_query", query)
    cached_result = load_from_cache(cache_key)
    if cached_result:
        console.print("[dim cyan]📦 Using cached enhanced query[/dim cyan]")
        return cached_result
    try:
        response = await litellm.acompletion(
            model="gemini/gemini-2.5-flash-preview-04-17",
            messages=[{
                "role": "user", 
                "content": f"""Given this research query: "{query}"
                Extract:
                1. Key terms and concepts (as a list)
                2. Related search terms
                3. A more specific/enhanced version of the query
                Return as JSON:
                {{
                    "key_terms": ["term1", "term2"],
                    "related_terms": ["related1", "related2"],
                    "enhanced_query": "enhanced version of query"
                }}"""
            }],
            # reasoning_effort="low",
            temperature=0.3,
            response_format={"type": "json_object"}
        )
        data = json.loads(response.choices[0].message.content)
        # Create search patterns
        all_terms = data["key_terms"] + data["related_terms"]
        patterns = [f"*{term.lower()}*" for term in all_terms]
        result = ResearchQuery(
            original_query=query,
            enhanced_query=data["enhanced_query"],
            search_patterns=patterns[:10],  # Limit patterns
            timestamp=datetime.now().isoformat()
        )
        # Cache the result
        save_to_cache(cache_key, result)
        return result
    except Exception as e:
        console.print(f"[yellow]⚠️ LLM enhancement failed: {e}[/yellow]")
        # Fallback to simple tokenization
        return ResearchQuery(
            original_query=query,
            enhanced_query=query,
            search_patterns=tokenize_query_to_patterns(query),
            timestamp=datetime.now().isoformat()
        )
 def tokenize_query_to_patterns(query: str) -> List[str]:
    """
    Convert query into URL patterns for URLSeeder
    Example: "AI startups funding" -> ["*ai*", "*startup*", "*funding*"]
    """
    # Simple tokenization - split and create patterns
    words = query.lower().split()
    # Filter out common words
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'that'}
    keywords = [w for w in words if w not in stop_words and len(w) > 2]
    # Create patterns
    patterns = [f"*{keyword}*" for keyword in keywords]
    return patterns[:8]  # Limit to 8 patterns
 async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List[Dict]:
    """
    Use URLSeeder to discover and rank URLs:
    1. Fetch all URLs from domain
    2. Filter by patterns
    3. Extract metadata (titles, descriptions)
    4. Rank by BM25 relevance score
    5. Return top K URLs
    """
    # Check cache
    cache_key = get_cache_key("discovered_urls", domain, query, config.top_k_urls)
    cached_result = load_from_cache(cache_key)
    if cached_result and not config.force_refresh:
        console.print("[dim cyan]📦 Using cached URL discovery[/dim cyan]")
        return cached_result
    console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
    # Initialize URL seeder
    seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
    # Configure seeding
    seeding_config = SeedingConfig(
        source="sitemap+cc",  # Use both sitemap and Common Crawl
        extract_head=config.extract_head_metadata,
        query=query,
        scoring_method=config.scoring_method,
        score_threshold=config.score_threshold,
        max_urls=config.max_urls_discovery,
        live_check=config.live_check,
        force=config.force_refresh
    )
    try:
        # Discover URLs
        urls = await seeder.urls(domain, seeding_config)
        # Sort by relevance score (descending)
        sorted_urls = sorted(
            urls, 
            key=lambda x: x.get('relevance_score', 0), 
            reverse=True
        )
        # Take top K
        top_urls = sorted_urls[:config.top_k_urls]
        console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
        # Cache the result
        save_to_cache(cache_key, top_urls)
        return top_urls
    except Exception as e:
        console.print(f"[red]❌ URL discovery failed: {e}[/red]")
        return []
 async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]:
    """
    Crawl selected URLs with content filtering:
    - Use AsyncWebCrawler.arun_many()
    - Apply content filter
    - Generate clean markdown
    """
    # Extract just URLs from the discovery results
    url_list = [u['url'] for u in urls if 'url' in u][:config.max_urls_to_crawl]
    if not url_list:
        console.print("[red]❌ No URLs to crawl[/red]")
        return []
    console.print(f"\n[cyan]🕷️ Crawling {len(url_list)} URLs...[/cyan]")
    # Check cache for each URL
    crawled_results = []
    urls_to_crawl = []
    for url in url_list:
        cache_key = get_cache_key("crawled_content", url, query)
        cached_content = load_from_cache(cache_key)
        if cached_content and not config.force_refresh:
            crawled_results.append(cached_content)
        else:
            urls_to_crawl.append(url)
    if urls_to_crawl:
        console.print(f"[cyan]📥 Crawling {len(urls_to_crawl)} new URLs (cached: {len(crawled_results)})[/cyan]")
        # Configure markdown generator with content filter
        md_generator = DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="dynamic",
                min_word_threshold=10
            ),
        )
        # Configure crawler
        crawler_config = CrawlerRunConfig(
            markdown_generator=md_generator,
            exclude_external_links=True,
            excluded_tags=['nav', 'header', 'footer', 'aside'],
        )
        # Create crawler with browser config
        async with AsyncWebCrawler(
            config=BrowserConfig(
                headless=config.headless,
                verbose=config.verbose
            )
        ) as crawler:
            # Crawl URLs
            results = await crawler.arun_many(
                urls_to_crawl,
                config=crawler_config,
                max_concurrent=config.max_concurrent_crawls
            )
            # Process results
            for url, result in zip(urls_to_crawl, results):
                if result.success:
                    content_data = {
                        'url': url,
                        'title': result.metadata.get('title', ''),
                        'markdown': result.markdown.fit_markdown or result.markdown.raw_markdown,
                        'raw_length': len(result.markdown.raw_markdown),
                        'fit_length': len(result.markdown.fit_markdown) if result.markdown.fit_markdown else len(result.markdown.raw_markdown),
                        'metadata': result.metadata
                    }
                    crawled_results.append(content_data)
                    # Cache the result
                    cache_key = get_cache_key("crawled_content", url, query)
                    save_to_cache(cache_key, content_data)
                else:
                    console.print(f"  [red]❌ Failed: {url[:50]}... - {result.error}[/red]")
    console.print(f"[green]✅ Successfully crawled {len(crawled_results)} URLs[/green]")
    return crawled_results
 async def generate_research_synthesis(
    query: str, 
    crawled_content: List[Dict]
 ) -> Tuple[str, List[Dict]]:
    """
    Use LLM to synthesize research findings:
    - Analyze all crawled content
    - Generate comprehensive answer
    - Extract citations and references
    """
    if not crawled_content:
        return "No content available for synthesis.", []
    console.print("\n[cyan]🤖 Generating research synthesis...[/cyan]")
    # Prepare content for LLM
    content_sections = []
    for i, content in enumerate(crawled_content, 1):
        section = f"""
 SOURCE {i}:
 Title: {content['title']}
 URL: {content['url']}
 Content Preview:
 {content['markdown'][:1500]}...
 """
        content_sections.append(section)
    combined_content = "\n---\n".join(content_sections)
    try:
        response = await litellm.acompletion(
            model="gemini/gemini-2.5-flash-preview-04-17",
            messages=[{
                "role": "user",
                "content": f"""Research Query: "{query}"
 Based on the following sources, provide a comprehensive research synthesis.
 {combined_content}
 Please provide:
 1. An executive summary (2-3 sentences)
 2. Key findings (3-5 bullet points)
 3. Detailed analysis (2-3 paragraphs)
 4. Future implications or trends
 Format your response with clear sections and cite sources using [Source N] notation.
 Keep the total response under 800 words."""
            }],
            # reasoning_effort="medium",
            temperature=0.7
        )
        synthesis = response.choices[0].message.content
        # Extract citations from the synthesis
        citations = []
        for i, content in enumerate(crawled_content, 1):
            if f"[Source {i}]" in synthesis or f"Source {i}" in synthesis:
                citations.append({
                    'source_id': i,
                    'title': content['title'],
                    'url': content['url']
                })
        return synthesis, citations
    except Exception as e:
        console.print(f"[red]❌ Synthesis generation failed: {e}[/red]")
        # Fallback to simple summary
        summary = f"Research on '{query}' found {len(crawled_content)} relevant articles:\n\n"
        for content in crawled_content[:3]:
            summary += f"- {content['title']}\n  {content['url']}\n\n"
        return summary, []
 def format_research_output(result: ResearchResult) -> str:
    """
    Format the final research output with:
    - Executive summary
    - Key findings
    - Detailed analysis
    - Citations and sources
    """
    output = []
    output.append("\n" + "=" * 60)
    output.append("🔬 RESEARCH RESULTS")
    output.append("=" * 60)
    # Query info
    output.append(f"\n📋 Query: {result.query.original_query}")
    if result.query.enhanced_query != result.query.original_query:
        output.append(f"   Enhanced: {result.query.enhanced_query}")
    # Discovery stats
    output.append(f"\n📊 Statistics:")
    output.append(f"   - URLs discovered: {len(result.discovered_urls)}")
    output.append(f"   - URLs crawled: {len(result.crawled_content)}")
    output.append(f"   - Processing time: {result.metadata.get('duration', 'N/A')}")
    # Synthesis
    output.append(f"\n📝 SYNTHESIS")
    output.append("-" * 60)
    output.append(result.synthesis)
    # Citations
    if result.citations:
        output.append(f"\n📚 SOURCES")
        output.append("-" * 60)
        for citation in result.citations:
            output.append(f"[{citation['source_id']}] {citation['title']}")
            output.append(f"    {citation['url']}")
    return "\n".join(output)
 async def save_research_results(result: ResearchResult, config: ResearchConfig) -> Tuple[str, str]:
    """
    Save research results in JSON and Markdown formats
    Returns:
        Tuple of (json_path, markdown_path)
    """
    # Create output directory
    output_dir = Path(config.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Generate filename based on query and timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    query_slug = result.query.original_query[:50].replace(" ", "_").replace("/", "_")
    base_filename = f"{timestamp}_{query_slug}"
    json_path = None
    md_path = None
    # Save JSON
    if config.save_json:
        json_path = output_dir / f"{base_filename}.json"
        with open(json_path, 'w') as f:
            json.dump(asdict(result), f, indent=2, default=str)
        console.print(f"\n[green]💾 JSON saved: {json_path}[/green]")
    # Save Markdown
    if config.save_markdown:
        md_path = output_dir / f"{base_filename}.md"
        # Create formatted markdown
        md_content = [
            f"# Research Report: {result.query.original_query}",
            f"\n**Generated on:** {result.metadata.get('timestamp', 'N/A')}",
            f"\n**Domain:** {result.metadata.get('domain', 'N/A')}",
            f"\n**Processing time:** {result.metadata.get('duration', 'N/A')}",
            "\n---\n",
            "## Query Information",
            f"- **Original Query:** {result.query.original_query}",
            f"- **Enhanced Query:** {result.query.enhanced_query or 'N/A'}",
            f"- **Search Patterns:** {', '.join(result.query.search_patterns or [])}",
            "\n## Statistics",
            f"- **URLs Discovered:** {len(result.discovered_urls)}",
            f"- **URLs Crawled:** {len(result.crawled_content)}",
            f"- **Sources Cited:** {len(result.citations)}",
            "\n## Research Synthesis\n",
            result.synthesis,
            "\n## Sources\n"
        ]
        # Add citations
        for citation in result.citations:
            md_content.append(f"### [{citation['source_id']}] {citation['title']}")
            md_content.append(f"- **URL:** [{citation['url']}]({citation['url']})")
            md_content.append("")
        # Add discovered URLs summary
        md_content.extend([
            "\n## Discovered URLs (Top 10)\n",
            "| Score | URL | Title |",
            "|-------|-----|-------|"
        ])
        for url_data in result.discovered_urls[:10]:
            score = url_data.get('relevance_score', 0)
            url = url_data.get('url', '')
            title = 'N/A'
            if 'head_data' in url_data and url_data['head_data']:
                title = url_data['head_data'].get('title', 'N/A')[:60] + '...'
            md_content.append(f"| {score:.3f} | {url[:50]}... | {title} |")
        # Write markdown
        with open(md_path, 'w') as f:
            f.write('\n'.join(md_content))
        console.print(f"[green]📄 Markdown saved: {md_path}[/green]")
    return str(json_path) if json_path else None, str(md_path) if md_path else None
 async def wait_for_user(message: str = "\nPress Enter to continue..."):
    """Wait for user input in interactive mode"""
    input(message)
 async def research_pipeline(
    query: str,
    config: ResearchConfig
 ) -> ResearchResult:
    """
    Main research pipeline orchestrator with configurable settings
    """
    start_time = datetime.now()
    # Display pipeline header
    header = Panel(
        f"[bold cyan]Research Pipeline[/bold cyan]\n\n"
        f"[dim]Domain:[/dim] {config.domain}\n"
        f"[dim]Mode:[/dim] {'Test' if config.test_mode else 'Production'}\n"
        f"[dim]Interactive:[/dim] {'Yes' if config.interactive_mode else 'No'}",
        title="🚀 Starting",
        border_style="cyan"
    )
    console.print(header)
    # Step 1: Enhance query (optional)
    console.print(f"\n[bold cyan]📝 Step 1: Query Processing[/bold cyan]")
    if config.interactive_mode:
        await wait_for_user()
    if config.use_llm_enhancement:
        research_query = await enhance_query_with_llm(query)
    else:
        research_query = ResearchQuery(
            original_query=query,
            enhanced_query=query,
            search_patterns=tokenize_query_to_patterns(query),
            timestamp=datetime.now().isoformat()
        )
    console.print(f"   [green]✅ Query ready:[/green] {research_query.enhanced_query or query}")
    # Step 2: Discover URLs
    console.print(f"\n[bold cyan]🔍 Step 2: URL Discovery[/bold cyan]")
    if config.interactive_mode:
        await wait_for_user()
    discovered_urls = await discover_urls(
        domain=config.domain,
        query=research_query.enhanced_query or query,
        config=config
    )
    if not discovered_urls:
        return ResearchResult(
            query=research_query,
            discovered_urls=[],
            crawled_content=[],
            synthesis="No relevant URLs found for the given query.",
            citations=[],
            metadata={'duration': str(datetime.now() - start_time)}
        )
    console.print(f"   [green]✅ Found {len(discovered_urls)} relevant URLs[/green]")
    # Step 3: Crawl selected URLs
    console.print(f"\n[bold cyan]🕷️ Step 3: Content Crawling[/bold cyan]")
    if config.interactive_mode:
        await wait_for_user()
    crawled_content = await crawl_selected_urls(
        urls=discovered_urls,
        query=research_query.enhanced_query or query,
        config=config
    )
    console.print(f"   [green]✅ Successfully crawled {len(crawled_content)} pages[/green]")
    # Step 4: Generate synthesis
    console.print(f"\n[bold cyan]🤖 Step 4: Synthesis Generation[/bold cyan]")
    if config.interactive_mode:
        await wait_for_user()
    synthesis, citations = await generate_research_synthesis(
        query=research_query.enhanced_query or query,
        crawled_content=crawled_content
    )
    console.print(f"   [green]✅ Generated synthesis with {len(citations)} citations[/green]")
    # Step 5: Create result
    result = ResearchResult(
        query=research_query,
        discovered_urls=discovered_urls,
        crawled_content=crawled_content,
        synthesis=synthesis,
        citations=citations,
        metadata={
            'duration': str(datetime.now() - start_time),
            'domain': config.domain,
            'timestamp': datetime.now().isoformat(),
            'config': asdict(config)
        }
    )
    duration = datetime.now() - start_time
    console.print(f"\n[bold green]✅ Research completed in {duration}[/bold green]")
    return result
 async def main():
    """
    Main entry point for the BBC Sport Research Assistant
    """
    # Example queries
    example_queries = [
        "Premier League transfer news and rumors",
        "Champions League match results and analysis", 
        "World Cup qualifying updates",
        "Football injury reports and return dates",
        "Tennis grand slam tournament results"
    ]
    # Display header
    console.print(Panel.fit(
        "[bold cyan]BBC Sport Research Assistant[/bold cyan]\n\n"
        "This tool demonstrates efficient research using URLSeeder:\n"
        "[dim]• Discover all URLs without crawling\n"
        "• Filter and rank by relevance\n"
        "• Crawl only the most relevant content\n"
        "• Generate AI-powered insights with citations[/dim]\n\n"
        f"[dim]📁 Working directory: {SCRIPT_DIR}[/dim]",
        title="🔬 Welcome",
        border_style="cyan"
    ))
    # Configuration options table
    config_table = Table(title="\n⚙️  Configuration Options", show_header=False, box=None)
    config_table.add_column(style="bold cyan", width=3)
    config_table.add_column()
    config_table.add_row("1", "Quick Test Mode (3 URLs, fast)")
    config_table.add_row("2", "Standard Mode (10 URLs, balanced)")
    config_table.add_row("3", "Comprehensive Mode (20 URLs, thorough)")
    config_table.add_row("4", "Custom Configuration")
    console.print(config_table)
    config_choice = input("\nSelect configuration (1-4): ").strip()
    # Create config based on choice
    if config_choice == "1":
        config = ResearchConfig(test_mode=True, interactive_mode=False)
    elif config_choice == "2":
        config = ResearchConfig(max_urls_to_crawl=10, top_k_urls=10)
    elif config_choice == "3":
        config = ResearchConfig(max_urls_to_crawl=20, top_k_urls=20, max_urls_discovery=200)
    else:
        # Custom configuration
        config = ResearchConfig()
        config.test_mode = input("\nTest mode? (y/n): ").lower() == 'y'
        config.interactive_mode = input("Interactive mode (pause between steps)? (y/n): ").lower() == 'y'
        config.use_llm_enhancement = input("Use AI to enhance queries? (y/n): ").lower() == 'y'
        if not config.test_mode:
            try:
                config.max_urls_to_crawl = int(input("Max URLs to crawl (default 10): ") or "10")
                config.top_k_urls = int(input("Top K URLs to select (default 10): ") or "10")
            except ValueError:
                console.print("[yellow]Using default values[/yellow]")
    # Display example queries
    query_table = Table(title="\n📋 Example Queries", show_header=False, box=None)
    query_table.add_column(style="bold cyan", width=3)
    query_table.add_column()
    for i, q in enumerate(example_queries, 1):
        query_table.add_row(str(i), q)
    console.print(query_table)
    query_input = input("\nSelect a query (1-5) or enter your own: ").strip()
    if query_input.isdigit() and 1 <= int(query_input) <= len(example_queries):
        query = example_queries[int(query_input) - 1]
    else:
        query = query_input if query_input else example_queries[0]
    console.print(f"\n[bold cyan]📝 Selected Query:[/bold cyan] {query}")
    # Run the research pipeline
    result = await research_pipeline(query=query, config=config)
    # Display results
    formatted_output = format_research_output(result)
    # print(formatted_output)
    console.print(Panel.fit(
        formatted_output,
        title="🔬 Research Results",
        border_style="green"
    ))
    # Save results
    if config.save_json or config.save_markdown:
        json_path, md_path = await save_research_results(result, config)
        # print(f"\n✅ Results saved successfully!")
        if json_path:
            console.print(f"[green]JSON saved at:[/green] {json_path}")
        if md_path:
            console.print(f"[green]Markdown saved at:[/green] {md_path}")
 if __name__ == "__main__":
    asyncio.run(main())