Add BBC Sp0ort Research Assistant pipeline example

- Implemented a comprehensive research pipeline using URLSeeder.
- Steps include user query input, optional LLM enhancement, URL discovery and ranking, content crawling, and synthesis generation.
- Introduced caching mechanism for enhanced query results and crawled content.
- Configurable settings for testing and production modes.
- Output results in JSON and Markdown formats with detailed research insights and citations.
This commit is contained in:
UncleCode
2025-06-04 23:23:21 +08:00
parent 09fd3e152a
commit b5c2732f88
2 changed files with 1324 additions and 273 deletions

View File

@@ -14,7 +14,16 @@ Features
""" """
from __future__ import annotations from __future__ import annotations
import aiofiles, asyncio, gzip, hashlib, io, json, os, pathlib, re, time import aiofiles
import asyncio
import gzip
import hashlib
import io
import json
import os
import pathlib
import re
import time
from datetime import timedelta from datetime import timedelta
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
@@ -42,7 +51,8 @@ except ImportError:
# Import AsyncLoggerBase from crawl4ai's logger module # Import AsyncLoggerBase from crawl4ai's logger module
# Assuming crawl4ai/async_logger.py defines AsyncLoggerBase # Assuming crawl4ai/async_logger.py defines AsyncLoggerBase
# You might need to adjust this import based on your exact file structure # You might need to adjust this import based on your exact file structure
from .async_logger import AsyncLoggerBase, AsyncLogger # Import AsyncLogger for default if needed # Import AsyncLogger for default if needed
from .async_logger import AsyncLoggerBase, AsyncLogger
# Import SeedingConfig for type hints # Import SeedingConfig for type hints
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
@@ -61,10 +71,13 @@ _meta_rx = re.compile(
r'<meta\s+(?:[^>]*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>', r'<meta\s+(?:[^>]*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>',
re.I) re.I)
_charset_rx = re.compile(r'<meta\s+[^>]*charset=["\']?([^"\' >]+)', re.I) _charset_rx = re.compile(r'<meta\s+[^>]*charset=["\']?([^"\' >]+)', re.I)
_title_rx = re.compile(r'<title>(.*?)</title>', re.I|re.S) _title_rx = re.compile(r'<title>(.*?)</title>', re.I | re.S)
_link_rx = re.compile(r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I) _link_rx = re.compile(
r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
# ────────────────────────────────────────────────────────────────────────── helpers # ────────────────────────────────────────────────────────────────────────── helpers
def _match(url: str, pattern: str) -> bool: def _match(url: str, pattern: str) -> bool:
if fnmatch.fnmatch(url, pattern): if fnmatch.fnmatch(url, pattern):
return True return True
@@ -72,11 +85,13 @@ def _match(url: str, pattern: str) -> bool:
return (fnmatch.fnmatch(canon, pattern) return (fnmatch.fnmatch(canon, pattern)
or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern))) or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern)))
def _parse_head(src: str) -> Dict[str, Any]: def _parse_head(src: str) -> Dict[str, Any]:
if LXML: if LXML:
try: try:
if isinstance(src, str): if isinstance(src, str):
src = src.encode("utf-8", "replace") # strip Unicode, let lxml decode # strip Unicode, let lxml decode
src = src.encode("utf-8", "replace")
doc = lxml_html.fromstring(src) doc = lxml_html.fromstring(src)
except (ValueError, etree.ParserError): except (ValueError, etree.ParserError):
return {} # malformed, bail gracefully return {} # malformed, bail gracefully
@@ -87,13 +102,18 @@ def _parse_head(src: str) -> Dict[str, Any]:
"meta": {}, "link": {}, "jsonld": [] "meta": {}, "link": {}, "jsonld": []
} }
for el in doc.xpath(".//meta"): for el in doc.xpath(".//meta"):
k = el.attrib.get("name") or el.attrib.get("property") or el.attrib.get("http-equiv") k = el.attrib.get("name") or el.attrib.get(
if k: info["meta"][k.lower()] = el.attrib.get("content", "") "property") or el.attrib.get("http-equiv")
elif "charset" in el.attrib: info["charset"] = el.attrib["charset"].lower() if k:
info["meta"][k.lower()] = el.attrib.get("content", "")
elif "charset" in el.attrib:
info["charset"] = el.attrib["charset"].lower()
for el in doc.xpath(".//link"): for el in doc.xpath(".//link"):
rel = " ".join(el.attrib.get("rel", [])).lower() rel = " ".join(el.attrib.get("rel", [])).lower()
if not rel: continue if not rel:
entry = {a: el.attrib[a] for a in ("href","as","type","hreflang") if a in el.attrib} continue
entry = {a: el.attrib[a] for a in (
"href", "as", "type", "hreflang") if a in el.attrib}
info["link"].setdefault(rel, []).append(entry) info["link"].setdefault(rel, []).append(entry)
# Extract JSON-LD structured data # Extract JSON-LD structured data
for script in doc.xpath('.//script[@type="application/ld+json"]'): for script in doc.xpath('.//script[@type="application/ld+json"]'):
@@ -109,14 +129,19 @@ def _parse_head(src: str) -> Dict[str, Any]:
info["lang"] = html_elem.attrib.get("lang", "") info["lang"] = html_elem.attrib.get("lang", "")
return info return info
# regex fallback # regex fallback
info: Dict[str,Any] = {"title":None,"charset":None,"meta":{},"link":{},"jsonld":[],"lang":""} info: Dict[str, Any] = {"title": None, "charset": None,
m=_title_rx.search(src); info["title"]=m.group(1).strip() if m else None "meta": {}, "link": {}, "jsonld": [], "lang": ""}
for k,v in _meta_rx.findall(src): info["meta"][k.lower()]=v m = _title_rx.search(src)
m=_charset_rx.search(src); info["charset"]=m.group(1).lower() if m else None info["title"] = m.group(1).strip() if m else None
for rel,href in _link_rx.findall(src): for k, v in _meta_rx.findall(src):
info["link"].setdefault(rel.lower(),[]).append({"href":href}) info["meta"][k.lower()] = v
m = _charset_rx.search(src)
info["charset"] = m.group(1).lower() if m else None
for rel, href in _link_rx.findall(src):
info["link"].setdefault(rel.lower(), []).append({"href": href})
# Try to extract JSON-LD with regex # Try to extract JSON-LD with regex
jsonld_pattern = re.compile(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I|re.S) jsonld_pattern = re.compile(
r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I | re.S)
for match in jsonld_pattern.findall(src): for match in jsonld_pattern.findall(src):
try: try:
jsonld_data = json.loads(match.strip()) jsonld_data = json.loads(match.strip())
@@ -130,41 +155,72 @@ def _parse_head(src: str) -> Dict[str, Any]:
return info return info
# ────────────────────────────────────────────────────────────────────────── class # ────────────────────────────────────────────────────────────────────────── class
class AsyncUrlSeeder: class AsyncUrlSeeder:
""" """
Async version of UrlSeeder. Async version of UrlSeeder.
Call pattern is await/async for / async with. Call pattern is await/async for / async with.
Public coroutine Public coroutines
---------------- -----------------
await seed.urls(...) await seed.urls(...)
returns List[Dict[str,Any]] (url, status, head_data) returns List[Dict[str,Any]] (url, status, head_data)
await seed.many_urls(...)
returns Dict[str, List[Dict[str,Any]]]
await seed.close()
closes the HTTP client if owned by seeder
Usage examples
--------------
# Manual cleanup:
seeder = AsyncUrlSeeder()
try:
urls = await seeder.urls("example.com", config)
finally:
await seeder.close()
# Using async context manager (recommended):
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
# Reusing existing client:
client = httpx.AsyncClient()
seeder = AsyncUrlSeeder(client=client)
urls = await seeder.urls("example.com", config)
# No need to close seeder, as it doesn't own the client
""" """
def __init__( def __init__(
self, self,
ttl: timedelta = TTL, ttl: timedelta = TTL,
client: Optional[httpx.AsyncClient]=None, client: Optional[httpx.AsyncClient] = None,
logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter
base_directory: Optional[Union[str, pathlib.Path]] = None, # NEW: Add base_directory # NEW: Add base_directory
base_directory: Optional[Union[str, pathlib.Path]] = None,
cache_root: Optional[Union[str, Path]] = None, cache_root: Optional[Union[str, Path]] = None,
): ):
self.ttl = ttl self.ttl = ttl
self._owns_client = client is None # Track if we created the client
self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={ self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}) })
self.logger = logger # Store the logger instance self.logger = logger # Store the logger instance
self.base_directory = pathlib.Path(base_directory or os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory self.base_directory = pathlib.Path(base_directory or os.getenv(
self.cache_dir = self.base_directory / ".crawl4ai" / "seeder_cache" # NEW: Specific cache dir for seeder "CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory
self.cache_dir = self.base_directory / ".crawl4ai" / \
"seeder_cache" # NEW: Specific cache dir for seeder
self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists
self.index_cache_path = self.cache_dir / "latest_cc_index.txt" # NEW: Index cache path self.index_cache_path = self.cache_dir / \
"latest_cc_index.txt" # NEW: Index cache path
# defer grabbing the index inside an active loop blows up # defer grabbing the index inside an active loop blows up
self.index_id: Optional[str] = None self.index_id: Optional[str] = None
self._rate_sem: Optional[asyncio.Semaphore] = None self._rate_sem: Optional[asyncio.Semaphore] = None
# ───────── cache dirs ───────── # ───────── cache dirs ─────────
self.cache_root = Path(os.path.expanduser(cache_root or "~/.cache/url_seeder")) self.cache_root = Path(os.path.expanduser(
cache_root or "~/.cache/url_seeder"))
(self.cache_root / "live").mkdir(parents=True, exist_ok=True) (self.cache_root / "live").mkdir(parents=True, exist_ok=True)
(self.cache_root / "head").mkdir(exist_ok=True) (self.cache_root / "head").mkdir(exist_ok=True)
@@ -173,7 +229,8 @@ class AsyncUrlSeeder:
if self.logger: if self.logger:
log_method = getattr(self.logger, level, None) log_method = getattr(self.logger, level, None)
if log_method: if log_method:
log_method(message=message, tag=tag, params=kwargs.get('params', {})) log_method(message=message, tag=tag,
params=kwargs.get('params', {}))
# else: # Fallback for unknown level, should not happen with AsyncLoggerBase # else: # Fallback for unknown level, should not happen with AsyncLoggerBase
# print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}") # print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}")
@@ -182,32 +239,31 @@ class AsyncUrlSeeder:
h = hashlib.sha1(url.encode()).hexdigest() h = hashlib.sha1(url.encode()).hexdigest()
return self.cache_root / kind / f"{h}.json" return self.cache_root / kind / f"{h}.json"
def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]: async def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
p = self._cache_path(kind, url) p = self._cache_path(kind, url)
if not p.exists(): if not p.exists():
return None return None
# TTL check if time.time()-p.stat().st_mtime > self.ttl.total_seconds():
if time.time() - p.stat().st_mtime > self.ttl.total_seconds():
return None return None
try: try:
return json.loads(p.read_text()) async with aiofiles.open(p, "r") as f:
return json.loads(await f.read())
except Exception: except Exception:
return None return None
def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None: async def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
try: try:
self._cache_path(kind, url).write_text( async with aiofiles.open(self._cache_path(kind, url), "w") as f:
json.dumps(data, separators=(",", ":")) await f.write(json.dumps(data, separators=(",", ":")))
)
except Exception: except Exception:
pass pass
# ─────────────────────────────── discovery entry # ─────────────────────────────── discovery entry
async def urls(self, async def urls(self,
domain: str, domain: str,
config: "SeedingConfig", config: "SeedingConfig",
) -> List[Dict[str,Any]]: ) -> List[Dict[str, Any]]:
""" """
Fetch URLs for a domain using configuration from SeedingConfig. Fetch URLs for a domain using configuration from SeedingConfig.
@@ -228,7 +284,8 @@ class AsyncUrlSeeder:
hits_per_sec = config.hits_per_sec hits_per_sec = config.hits_per_sec
self.force = config.force # Store force flag as instance attribute self.force = config.force # Store force flag as instance attribute
force = config.force force = config.force
verbose = config.verbose if config.verbose is not None else (self.logger.verbose if self.logger else False) verbose = config.verbose if config.verbose is not None else (
self.logger.verbose if self.logger else False)
max_urls = config.max_urls if config.max_urls is not None else -1 max_urls = config.max_urls if config.max_urls is not None else -1
query = config.query query = config.query
score_threshold = config.score_threshold score_threshold = config.score_threshold
@@ -247,11 +304,13 @@ class AsyncUrlSeeder:
valid_sources = {"cc", "sitemap"} valid_sources = {"cc", "sitemap"}
for s in sources: for s in sources:
if s not in valid_sources: if s not in valid_sources:
raise ValueError(f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}") raise ValueError(
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
if hits_per_sec: if hits_per_sec:
if hits_per_sec <= 0: if hits_per_sec <= 0:
self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED") self._log(
"warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
self._rate_sem = None self._rate_sem = None
else: else:
self._rate_sem = asyncio.Semaphore(hits_per_sec) self._rate_sem = asyncio.Semaphore(hits_per_sec)
@@ -268,11 +327,14 @@ class AsyncUrlSeeder:
async for u in self._from_sitemaps(domain, pattern, force): async for u in self._from_sitemaps(domain, pattern, force):
yield u yield u
if "cc" in sources: if "cc" in sources:
self._log("debug", "Fetching from Common Crawl...", tag="URL_SEED") self._log("debug", "Fetching from Common Crawl...",
tag="URL_SEED")
async for u in self._from_cc(domain, pattern, force): async for u in self._from_cc(domain, pattern, force):
yield u yield u
queue = asyncio.Queue() # Use bounded queue to prevent RAM spikes with large domains
queue_size = min(10000, max(1000, concurrency * 100)) # Dynamic size based on concurrency
queue = asyncio.Queue(maxsize=queue_size)
producer_done = asyncio.Event() producer_done = asyncio.Event()
stop_event = asyncio.Event() stop_event = asyncio.Event()
seen: set[str] = set() seen: set[str] = set()
@@ -285,27 +347,31 @@ class AsyncUrlSeeder:
params={"url": u}, tag="URL_SEED") params={"url": u}, tag="URL_SEED")
continue continue
if stop_event.is_set(): if stop_event.is_set():
self._log("info", "Producer stopping due to max_urls limit.", tag="URL_SEED") self._log(
"info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
break break
await queue.put(u) seen.add(u)
await queue.put(u) # Will block if queue is full, providing backpressure
except Exception as e: except Exception as e:
self._log("error", "Producer encountered an error: {error}", params={"error": str(e)}, tag="URL_SEED") self._log("error", "Producer encountered an error: {error}", params={
"error": str(e)}, tag="URL_SEED")
finally: finally:
producer_done.set() producer_done.set()
self._log("debug", "Producer finished.", tag="URL_SEED") self._log("debug", "Producer finished.", tag="URL_SEED")
async def worker(res_list: List[Dict[str, Any]]):
async def worker(res_list: List[Dict[str,Any]]):
while True: while True:
if queue.empty() and producer_done.is_set(): if queue.empty() and producer_done.is_set():
# self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED") # self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED")
break break
try: try:
url = await asyncio.wait_for(queue.get(), 5) # Increased timeout slightly # Increased timeout slightly
url = await asyncio.wait_for(queue.get(), 5)
except asyncio.TimeoutError: except asyncio.TimeoutError:
continue # Keep checking queue and producer_done status continue # Keep checking queue and producer_done status
except Exception as e: except Exception as e:
self._log("error", "Worker failed to get URL from queue: {error}", params={"error": str(e)}, tag="URL_SEED") self._log("error", "Worker failed to get URL from queue: {error}", params={
"error": str(e)}, tag="URL_SEED")
continue continue
if max_urls > 0 and len(res_list) >= max_urls: if max_urls > 0 and len(res_list) >= max_urls:
@@ -332,16 +398,17 @@ class AsyncUrlSeeder:
if self._rate_sem: # global QPS control if self._rate_sem: # global QPS control
async with self._rate_sem: async with self._rate_sem:
await self._validate(url, res_list, live_check, extract_head, await self._validate(url, res_list, live_check, extract_head,
head_timeout, verbose) head_timeout, verbose, query, score_threshold, scoring_method)
else: else:
await self._validate(url, res_list, live_check, extract_head, await self._validate(url, res_list, live_check, extract_head,
head_timeout, verbose) head_timeout, verbose, query, score_threshold, scoring_method)
queue.task_done() # Mark task as done for queue.join() if ever used queue.task_done() # Mark task as done for queue.join() if ever used
# launch # launch
results: List[Dict[str,Any]] = [] results: List[Dict[str, Any]] = []
prod_task = asyncio.create_task(producer()) prod_task = asyncio.create_task(producer())
workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)] workers = [asyncio.create_task(worker(results))
for _ in range(concurrency)]
# Wait for all workers to finish # Wait for all workers to finish
await asyncio.gather(prod_task, *workers) await asyncio.gather(prod_task, *workers)
@@ -350,52 +417,15 @@ class AsyncUrlSeeder:
self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}", self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
params={"domain": domain, "count": len(results)}, tag="URL_SEED") params={"domain": domain, "count": len(results)}, tag="URL_SEED")
# Apply BM25 scoring if query is provided and extract_head is enabled # Sort by relevance score if query was provided
if query and extract_head and scoring_method == "bm25": if query and extract_head and scoring_method == "bm25":
self._log("info", "Applying BM25 scoring for query: '{query}'", results.sort(key=lambda x: x.get(
params={"query": query}, tag="URL_SEED") "relevance_score", 0.0), reverse=True)
self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
# Extract text contexts from all results params={"count": len(results), "query": query}, tag="URL_SEED")
documents = []
valid_indices = []
for i, result in enumerate(results):
if result.get("head_data"):
text_context = self._extract_text_context(result["head_data"])
if text_context: # Only include non-empty contexts
documents.append(text_context)
valid_indices.append(i)
if documents:
# Calculate BM25 scores
scores = self._calculate_bm25_score(query, documents)
# Add scores to results
for idx, score in zip(valid_indices, scores):
results[idx]["relevance_score"] = float(score)
# Add zero scores to results without head_data
for i, result in enumerate(results):
if i not in valid_indices:
result["relevance_score"] = 0.0
# Filter by score threshold if specified
if score_threshold is not None:
original_count = len(results)
results = [r for r in results if r.get("relevance_score", 0.0) >= score_threshold]
self._log("info", "Filtered {filtered} URLs below score threshold {threshold}. Remaining: {remaining}",
params={"filtered": original_count - len(results),
"threshold": score_threshold,
"remaining": len(results)}, tag="URL_SEED")
# Sort by relevance score (highest first)
results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
else:
self._log("warning", "No valid head data found for BM25 scoring.", tag="URL_SEED")
# Add zero scores to all results
for result in results:
result["relevance_score"] = 0.0
elif query and not extract_head: elif query and not extract_head:
self._log("warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED") self._log(
"warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
return results[:max_urls] if max_urls > 0 else results return results[:max_urls] if max_urls > 0 else results
@@ -403,7 +433,7 @@ class AsyncUrlSeeder:
self, self,
domains: Sequence[str], domains: Sequence[str],
config: "SeedingConfig", config: "SeedingConfig",
) -> Dict[str, List[Dict[str,Any]]]: ) -> Dict[str, List[Dict[str, Any]]]:
""" """
Fetch URLs for many domains in parallel. Fetch URLs for many domains in parallel.
@@ -430,7 +460,8 @@ class AsyncUrlSeeder:
results = await asyncio.gather(*tasks) results = await asyncio.gather(*tasks)
final_results = dict(zip(domains, results)) final_results = dict(zip(domains, results))
self._log("info", "Finished URL seeding for multiple domains.", tag="URL_SEED") self._log(
"info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
return final_results return final_results
async def _resolve_head(self, url: str) -> Optional[str]: async def _resolve_head(self, url: str) -> Optional[str]:
@@ -462,14 +493,14 @@ class AsyncUrlSeeder:
params={"url": url, "err": str(e)}, tag="URL_SEED") params={"url": url, "err": str(e)}, tag="URL_SEED")
return None return None
# ─────────────────────────────── CC # ─────────────────────────────── CC
async def _from_cc(self, domain:str, pattern:str, force:bool): async def _from_cc(self, domain: str, pattern: str, force: bool):
import re import re
digest = hashlib.md5(pattern.encode()).hexdigest()[:8] digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
# ── normalise for CC (strip scheme, query, fragment) # ── normalise for CC (strip scheme, query, fragment)
raw = re.sub(r'^https?://', '', domain).split('#', 1)[0].split('?', 1)[0].lstrip('.') raw = re.sub(r'^https?://', '', domain).split('#',
1)[0].split('?', 1)[0].lstrip('.')
# ── sanitize only for cache-file name # ── sanitize only for cache-file name
safe = re.sub('[/?#]+', '_', raw) safe = re.sub('[/?#]+', '_', raw)
@@ -478,32 +509,34 @@ class AsyncUrlSeeder:
if path.exists() and not force: if path.exists() and not force:
self._log("info", "Loading CC URLs for {domain} from cache: {path}", self._log("info", "Loading CC URLs for {domain} from cache: {path}",
params={"domain": domain, "path": path}, tag="URL_SEED") params={"domain": domain, "path": path}, tag="URL_SEED")
async with aiofiles.open(path,"r") as fp: async with aiofiles.open(path, "r") as fp:
async for line in fp: async for line in fp:
url=line.strip() url = line.strip()
if _match(url,pattern): yield url if _match(url, pattern):
yield url
return return
# build CC glob if a path is present keep it, else add trailing /* # build CC glob if a path is present keep it, else add trailing /*
glob = f"*.{raw}*" if '/' in raw else f"*.{raw}/*" glob = f"*.{raw}*" if '/' in raw else f"*.{raw}/*"
url = f"https://index.commoncrawl.org/{self.index_id}-index?url={quote(glob, safe='*')}&output=json" url = f"https://index.commoncrawl.org/{self.index_id}-index?url={quote(glob, safe='*')}&output=json"
retries=(1,3,7) retries = (1, 3, 7)
self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}", self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}",
params={"domain": domain, "url": url}, tag="URL_SEED") params={"domain": domain, "url": url}, tag="URL_SEED")
for i,d in enumerate(retries+(-1,)): # last -1 means don't retry for i, d in enumerate(retries+(-1,)): # last -1 means don't retry
try: try:
async with self.client.stream("GET", url) as r: async with self.client.stream("GET", url) as r:
r.raise_for_status() r.raise_for_status()
async with aiofiles.open(path,"w") as fp: async with aiofiles.open(path, "w") as fp:
async for line in r.aiter_lines(): async for line in r.aiter_lines():
rec = json.loads(line) rec = json.loads(line)
u = rec["url"] u = rec["url"]
await fp.write(u+"\n") await fp.write(u+"\n")
if _match(u,pattern): yield u if _match(u, pattern):
yield u
return return
except httpx.HTTPStatusError as e: except httpx.HTTPStatusError as e:
if e.response.status_code==503 and i<len(retries): if e.response.status_code == 503 and i < len(retries):
self._log("warning", "Common Crawl API returned 503 for {domain}. Retrying in {delay}s.", self._log("warning", "Common Crawl API returned 503 for {domain}. Retrying in {delay}s.",
params={"domain": domain, "delay": retries[i]}, tag="URL_SEED") params={"domain": domain, "delay": retries[i]}, tag="URL_SEED")
await asyncio.sleep(retries[i]) await asyncio.sleep(retries[i])
@@ -516,9 +549,8 @@ class AsyncUrlSeeder:
params={"domain": domain, "error": str(e)}, tag="URL_SEED") params={"domain": domain, "error": str(e)}, tag="URL_SEED")
raise raise
# ─────────────────────────────── Sitemaps # ─────────────────────────────── Sitemaps
async def _from_sitemaps(self, domain:str, pattern:str, force:bool=False): async def _from_sitemaps(self, domain: str, pattern: str, force: bool = False):
""" """
1. Probe default sitemap locations. 1. Probe default sitemap locations.
2. If none exist, parse robots.txt for alternative sitemap URLs. 2. If none exist, parse robots.txt for alternative sitemap URLs.
@@ -543,15 +575,16 @@ class AsyncUrlSeeder:
# 1⃣ direct sitemap probe # 1⃣ direct sitemap probe
# strip any scheme so we can handle https → http fallback # strip any scheme so we can handle https → http fallback
host=re.sub(r'^https?://','',domain).rstrip('/') host = re.sub(r'^https?://', '', domain).rstrip('/')
schemes=('https','http') # prefer TLS, downgrade if needed schemes = ('https', 'http') # prefer TLS, downgrade if needed
for scheme in schemes: for scheme in schemes:
for suffix in ("/sitemap.xml","/sitemap_index.xml"): for suffix in ("/sitemap.xml", "/sitemap_index.xml"):
sm=f"{scheme}://{host}{suffix}" sm = f"{scheme}://{host}{suffix}"
sm = await self._resolve_head(sm) sm = await self._resolve_head(sm)
if sm: if sm:
self._log("info","Found sitemap at {url}",params={"url":sm},tag="URL_SEED") self._log("info", "Found sitemap at {url}", params={
"url": sm}, tag="URL_SEED")
async with aiofiles.open(path, "w") as fp: async with aiofiles.open(path, "w") as fp:
async for u in self._iter_sitemap(sm): async for u in self._iter_sitemap(sm):
await fp.write(u + "\n") await fp.write(u + "\n")
@@ -560,15 +593,18 @@ class AsyncUrlSeeder:
return return
# 2⃣ robots.txt fallback # 2⃣ robots.txt fallback
robots=f"https://{domain.rstrip('/')}/robots.txt" robots = f"https://{domain.rstrip('/')}/robots.txt"
try: try:
r=await self.client.get(robots,timeout=10,follow_redirects=True) r = await self.client.get(robots, timeout=10, follow_redirects=True)
if not 200<=r.status_code<300: if not 200 <= r.status_code < 300:
self._log("warning","robots.txt unavailable for {d} HTTP{c}",params={"d":domain,"c":r.status_code},tag="URL_SEED") self._log("warning", "robots.txt unavailable for {d} HTTP{c}", params={
"d": domain, "c": r.status_code}, tag="URL_SEED")
return return
sitemap_lines=[l.split(":",1)[1].strip() for l in r.text.splitlines() if l.lower().startswith("sitemap:")] sitemap_lines = [l.split(":", 1)[1].strip(
) for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
except Exception as e: except Exception as e:
self._log("warning","Failed to fetch robots.txt for {d}: {e}",params={"d":domain,"e":str(e)},tag="URL_SEED") self._log("warning", "Failed to fetch robots.txt for {d}: {e}", params={
"d": domain, "e": str(e)}, tag="URL_SEED")
return return
if sitemap_lines: if sitemap_lines:
@@ -579,7 +615,7 @@ class AsyncUrlSeeder:
if _match(u, pattern): if _match(u, pattern):
yield u yield u
async def _iter_sitemap(self, url:str): async def _iter_sitemap(self, url: str):
try: try:
r = await self.client.get(url, timeout=15) r = await self.client.get(url, timeout=15)
r.raise_for_status() r.raise_for_status()
@@ -598,6 +634,11 @@ class AsyncUrlSeeder:
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
# Detect if this is a sitemap index by checking for <sitemapindex> or presence of <sitemap> elements
is_sitemap_index = False
sub_sitemaps = []
regular_urls = []
# Use lxml for XML parsing if available, as it's generally more robust # Use lxml for XML parsing if available, as it's generally more robust
if LXML: if LXML:
try: try:
@@ -608,19 +649,21 @@ class AsyncUrlSeeder:
# Define namespace for sitemap # Define namespace for sitemap
ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'} ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
# First check if this is a sitemap index # Check for sitemap index entries
for sitemap_elem in root.xpath('//s:sitemap/s:loc', namespaces=ns): sitemap_locs = root.xpath('//s:sitemap/s:loc', namespaces=ns)
if sitemap_locs:
is_sitemap_index = True
for sitemap_elem in sitemap_locs:
loc = sitemap_elem.text.strip() if sitemap_elem.text else "" loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
if loc: if loc:
self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED") sub_sitemaps.append(loc)
async for u in self._iter_sitemap(loc):
yield u
# Then check for regular URLs # If not a sitemap index, get regular URLs
if not is_sitemap_index:
for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns): for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
loc = loc_elem.text.strip() if loc_elem.text else "" loc = loc_elem.text.strip() if loc_elem.text else ""
if loc: if loc:
yield loc regular_urls.append(loc)
except Exception as e: except Exception as e:
self._log("error", "LXML parsing error for sitemap {url}: {error}", self._log("error", "LXML parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
@@ -636,75 +679,166 @@ class AsyncUrlSeeder:
elem.tag = elem.tag.split('}')[1] elem.tag = elem.tag.split('}')[1]
# Check for sitemap index entries # Check for sitemap index entries
for sitemap in root.findall('.//sitemap'): sitemaps = root.findall('.//sitemap')
if sitemaps:
is_sitemap_index = True
for sitemap in sitemaps:
loc_elem = sitemap.find('loc') loc_elem = sitemap.find('loc')
if loc_elem is not None and loc_elem.text: if loc_elem is not None and loc_elem.text:
loc = loc_elem.text.strip() sub_sitemaps.append(loc_elem.text.strip())
self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
async for u in self._iter_sitemap(loc):
yield u
# Check for regular URL entries # If not a sitemap index, get regular URLs
for url in root.findall('.//url'): if not is_sitemap_index:
loc_elem = url.find('loc') for url_elem in root.findall('.//url'):
loc_elem = url_elem.find('loc')
if loc_elem is not None and loc_elem.text: if loc_elem is not None and loc_elem.text:
yield loc_elem.text.strip() regular_urls.append(loc_elem.text.strip())
except Exception as e: except Exception as e:
self._log("error", "ElementTree parsing error for sitemap {url}: {error}", self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
return return
# Process based on type
if is_sitemap_index and sub_sitemaps:
self._log("info", "Processing sitemap index with {count} sub-sitemaps in parallel",
params={"count": len(sub_sitemaps)}, tag="URL_SEED")
# Create a bounded queue for results to prevent RAM issues
# For sitemap indexes, use a larger queue as we expect many URLs
queue_size = min(50000, len(sub_sitemaps) * 1000) # Estimate 1000 URLs per sitemap
result_queue = asyncio.Queue(maxsize=queue_size)
completed_count = 0
total_sitemaps = len(sub_sitemaps)
async def process_subsitemap(sitemap_url: str):
try:
self._log(
"debug", "Processing sub-sitemap: {url}", params={"url": sitemap_url}, tag="URL_SEED")
# Recursively process sub-sitemap
async for u in self._iter_sitemap(sitemap_url):
await result_queue.put(u) # Will block if queue is full
except Exception as e:
self._log("error", "Error processing sub-sitemap {url}: {error}",
params={"url": sitemap_url, "error": str(e)}, tag="URL_SEED")
finally:
# Put sentinel to signal completion
await result_queue.put(None)
# Start all tasks
tasks = [asyncio.create_task(process_subsitemap(sm))
for sm in sub_sitemaps]
# Yield results as they come in
while completed_count < total_sitemaps:
item = await result_queue.get()
if item is None:
completed_count += 1
else:
yield item
# Ensure all tasks are done
await asyncio.gather(*tasks, return_exceptions=True)
else:
# Regular sitemap - yield URLs directly
for u in regular_urls:
yield u
# ─────────────────────────────── validate helpers # ─────────────────────────────── validate helpers
async def _validate(self, url:str, res_list:List[Dict[str,Any]], live:bool, async def _validate(self, url: str, res_list: List[Dict[str, Any]], live: bool,
extract:bool, timeout:int, verbose:bool): extract: bool, timeout: int, verbose: bool, query: Optional[str] = None,
score_threshold: Optional[float] = None, scoring_method: str = "bm25"):
# Local verbose parameter for this function is used to decide if intermediate logs should be printed # Local verbose parameter for this function is used to decide if intermediate logs should be printed
# The main logger's verbose status should be controlled by the caller. # The main logger's verbose status should be controlled by the caller.
cache_kind = "head" if extract else "live" cache_kind = "head" if extract else "live"
# ---------- try cache ---------- # ---------- try cache ----------
if (live or extract) and not (hasattr(self, 'force') and self.force): if not (hasattr(self, 'force') and self.force):
cached = self._cache_get(cache_kind, url) cached = await self._cache_get(cache_kind, url)
if cached: if cached:
res_list.append(cached) res_list.append(cached)
return return
if extract: if extract:
self._log("debug", "Fetching head for {url}", params={"url": url}, tag="URL_SEED") self._log("debug", "Fetching head for {url}", params={
ok,html,final = await self._fetch_head(url,timeout) "url": url}, tag="URL_SEED")
status="valid" if ok else "not_valid" ok, html, final = await self._fetch_head(url, timeout)
status = "valid" if ok else "not_valid"
self._log("info" if ok else "warning", "HEAD {status} for {final_url}", self._log("info" if ok else "warning", "HEAD {status} for {final_url}",
params={"status": status.upper(), "final_url": final or url}, tag="URL_SEED") params={"status": status.upper(), "final_url": final or url}, tag="URL_SEED")
# head_data = _parse_head(html) if ok else {}
head_data = await asyncio.to_thread(_parse_head, html) if ok else {}
entry = { entry = {
"url": final or url, "url": final or url,
"status": status, "status": status,
"head_data": _parse_head(html) if ok else {}, "head_data": head_data,
} }
if live or extract:
self._cache_set(cache_kind, url, entry) # Apply BM25 scoring if query is provided and head data exists
res_list.append(entry) if query and ok and scoring_method == "bm25" and head_data:
text_context = self._extract_text_context(head_data)
if text_context:
# Calculate BM25 score for this single document
# scores = self._calculate_bm25_score(query, [text_context])
scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
relevance_score = scores[0] if scores else 0.0
entry["relevance_score"] = float(relevance_score)
else:
# No text context, use URL-based scoring as fallback
relevance_score = self._calculate_url_relevance_score(
query, entry["url"])
entry["relevance_score"] = float(relevance_score)
elif query:
# Query provided but no head data - we reject this entry
self._log("debug", "No head data for {url}, using URL-based scoring",
params={"url": url}, tag="URL_SEED")
return
# relevance_score = self._calculate_url_relevance_score(query, entry["url"])
# entry["relevance_score"] = float(relevance_score)
elif live: elif live:
self._log("debug", "Performing live check for {url}", params={"url": url}, tag="URL_SEED") self._log("debug", "Performing live check for {url}", params={
ok=await self._resolve_head(url) "url": url}, tag="URL_SEED")
status="valid" if ok else "not_valid" ok = await self._resolve_head(url)
status = "valid" if ok else "not_valid"
self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}", self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}",
params={"status": status.upper(), "url": url}, tag="URL_SEED") params={"status": status.upper(), "url": url}, tag="URL_SEED")
entry = {"url": url, "status": status, "head_data": {}} entry = {"url": url, "status": status, "head_data": {}}
if live or extract:
self._cache_set(cache_kind, url, entry) # Apply URL-based scoring if query is provided
res_list.append(entry) if query:
relevance_score = self._calculate_url_relevance_score(
query, url)
entry["relevance_score"] = float(relevance_score)
else: else:
entry = {"url": url, "status": "unknown", "head_data": {}} entry = {"url": url, "status": "unknown", "head_data": {}}
# Apply URL-based scoring if query is provided
if query:
relevance_score = self._calculate_url_relevance_score(
query, url)
entry["relevance_score"] = float(relevance_score)
# Now decide whether to add the entry based on score threshold
if query and "relevance_score" in entry:
if score_threshold is None or entry["relevance_score"] >= score_threshold:
if live or extract: if live or extract:
self._cache_set(cache_kind, url, entry) await self._cache_set(cache_kind, url, entry)
res_list.append(entry)
else:
self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
else:
# No query or no scoring - add as usual
if live or extract:
await self._cache_set(cache_kind, url, entry)
res_list.append(entry) res_list.append(entry)
async def _head_ok(self, url: str, timeout: int) -> bool:
async def _head_ok(self, url:str, timeout:int)->bool:
try: try:
r=await self.client.head(url, timeout=timeout, r = await self.client.head(url, timeout=timeout,
headers={"Range":"bytes=0-0","Accept-Encoding":"identity"}) headers={"Range": "bytes=0-0", "Accept-Encoding": "identity"})
r.raise_for_status() # Raise for bad status codes (4xx, 5xx) r.raise_for_status() # Raise for bad status codes (4xx, 5xx)
return True return True
except httpx.RequestError as e: except httpx.RequestError as e:
@@ -743,7 +877,7 @@ class AsyncUrlSeeder:
follow_redirects=False, follow_redirects=False,
) as r: ) as r:
if r.status_code in (301,302,303,307,308): if r.status_code in (301, 302, 303, 307, 308):
location = r.headers.get("Location") location = r.headers.get("Location")
if location: if location:
url = urljoin(url, location) url = urljoin(url, location)
@@ -753,10 +887,12 @@ class AsyncUrlSeeder:
else: else:
self._log("warning", "Redirect status {status_code} but no Location header for {url}", self._log("warning", "Redirect status {status_code} but no Location header for {url}",
params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
return False, "", str(r.url) # Return original URL if no new location # Return original URL if no new location
return False, "", str(r.url)
# For 2xx or other non-redirect codes, proceed to read content # For 2xx or other non-redirect codes, proceed to read content
if not (200 <= r.status_code < 400): # Only allow successful codes, or continue # Only allow successful codes, or continue
if not (200 <= r.status_code < 400):
self._log("warning", "Non-success status {status_code} when fetching head for {url}", self._log("warning", "Non-success status {status_code} when fetching head for {url}",
params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED") params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
return False, "", str(r.url) return False, "", str(r.url)
@@ -787,18 +923,20 @@ class AsyncUrlSeeder:
self._log( self._log(
"warning", "warning",
"Decompression error for {url} ({encoding}): {error}", "Decompression error for {url} ({encoding}): {error}",
params={"url": r.url, "encoding": enc, "error": str(e)}, params={"url": r.url,
"encoding": enc, "error": str(e)},
tag="URL_SEED", tag="URL_SEED",
) )
# fall through with raw buf # fall through with raw buf
# Find the </head> tag case-insensitively and decode # Find the </head> tag case-insensitively and decode
idx = buf.lower().find(b"</head>") idx = buf.lower().find(b"</head>")
if idx==-1: if idx == -1:
self._log("debug", "No </head> tag found in initial bytes of {url}", self._log("debug", "No </head> tag found in initial bytes of {url}",
params={"url": r.url}, tag="URL_SEED") params={"url": r.url}, tag="URL_SEED")
# If no </head> is found, take a reasonable chunk or all if small # If no </head> is found, take a reasonable chunk or all if small
html_bytes = buf if len(buf) < 10240 else buf[:10240] # Take max 10KB if no head tag # Take max 10KB if no head tag
html_bytes = buf if len(buf) < 10240 else buf[:10240]
else: else:
html_bytes = buf[:idx+7] # Include </head> tag html_bytes = buf[:idx+7] # Include </head> tag
@@ -813,17 +951,18 @@ class AsyncUrlSeeder:
) )
html = html_bytes.decode("latin-1", "replace") html = html_bytes.decode("latin-1", "replace")
return True,html,str(r.url) # Return the actual URL after redirects # Return the actual URL after redirects
return True, html, str(r.url)
except httpx.RequestError as e: except httpx.RequestError as e:
self._log("debug", "Fetch head network error for {url}: {error}", self._log("debug", "Fetch head network error for {url}: {error}",
params={"url": url, "error": str(e)}, tag="URL_SEED") params={"url": url, "error": str(e)}, tag="URL_SEED")
return False,"",url return False, "", url
# If loop finishes without returning (e.g. too many redirects) # If loop finishes without returning (e.g. too many redirects)
self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}", self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}",
params={"max_redirects": max_redirects, "url": url}, tag="URL_SEED") params={"max_redirects": max_redirects, "url": url}, tag="URL_SEED")
return False,"",url return False, "", url
# ─────────────────────────────── BM25 scoring helpers # ─────────────────────────────── BM25 scoring helpers
def _extract_text_context(self, head_data: Dict[str, Any]) -> str: def _extract_text_context(self, head_data: Dict[str, Any]) -> str:
@@ -865,7 +1004,8 @@ class AsyncUrlSeeder:
if isinstance(jsonld[field], str): if isinstance(jsonld[field], str):
text_parts.append(jsonld[field]) text_parts.append(jsonld[field])
elif isinstance(jsonld[field], list): elif isinstance(jsonld[field], list):
text_parts.extend(str(item) for item in jsonld[field] if item) text_parts.extend(str(item)
for item in jsonld[field] if item)
# Handle @graph structures # Handle @graph structures
if "@graph" in jsonld and isinstance(jsonld["@graph"], list): if "@graph" in jsonld and isinstance(jsonld["@graph"], list):
@@ -878,10 +1018,99 @@ class AsyncUrlSeeder:
# Combine all text parts # Combine all text parts
return " ".join(filter(None, text_parts)) return " ".join(filter(None, text_parts))
def _calculate_url_relevance_score(self, query: str, url: str) -> float:
"""Calculate relevance score between query and URL using string matching."""
# Normalize inputs
query_lower = query.lower()
url_lower = url.lower()
# Extract URL components
from urllib.parse import urlparse
parsed = urlparse(url)
domain = parsed.netloc.replace('www.', '')
path = parsed.path.strip('/')
# Create searchable text from URL
# Split domain by dots and path by slashes
domain_parts = domain.split('.')
path_parts = [p for p in path.split('/') if p]
# Include query parameters if any
query_params = parsed.query
param_parts = []
if query_params:
for param in query_params.split('&'):
if '=' in param:
key, value = param.split('=', 1)
param_parts.extend([key, value])
# Combine all parts
all_parts = domain_parts + path_parts + param_parts
# Calculate scores
scores = []
query_tokens = query_lower.split()
# 1. Exact match in any part (highest score)
for part in all_parts:
part_lower = part.lower()
if query_lower in part_lower:
scores.append(1.0)
elif part_lower in query_lower:
scores.append(0.9)
# 2. Token matching
for token in query_tokens:
token_scores = []
for part in all_parts:
part_lower = part.lower()
if token in part_lower:
# Score based on how much of the part the token covers
coverage = len(token) / len(part_lower)
token_scores.append(0.7 * coverage)
elif part_lower in token:
coverage = len(part_lower) / len(token)
token_scores.append(0.6 * coverage)
if token_scores:
scores.append(max(token_scores))
# 3. Character n-gram similarity (for fuzzy matching)
def get_ngrams(text, n=3):
return set(text[i:i+n] for i in range(len(text)-n+1))
# Combine all URL parts into one string for n-gram comparison
url_text = ' '.join(all_parts).lower()
if len(query_lower) >= 3 and len(url_text) >= 3:
query_ngrams = get_ngrams(query_lower)
url_ngrams = get_ngrams(url_text)
if query_ngrams and url_ngrams:
intersection = len(query_ngrams & url_ngrams)
union = len(query_ngrams | url_ngrams)
jaccard = intersection / union if union > 0 else 0
scores.append(0.5 * jaccard)
# Calculate final score
if not scores:
return 0.0
# Weighted average with bias towards higher scores
scores.sort(reverse=True)
weighted_score = 0
total_weight = 0
for i, score in enumerate(scores):
weight = 1 / (i + 1) # Higher weight for better matches
weighted_score += score * weight
total_weight += weight
final_score = weighted_score / total_weight if total_weight > 0 else 0
return min(final_score, 1.0) # Cap at 1.0
def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]: def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
"""Calculate BM25 scores for documents against a query.""" """Calculate BM25 scores for documents against a query."""
if not HAS_BM25: if not HAS_BM25:
self._log("warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED") self._log(
"warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
return [0.0] * len(documents) return [0.0] * len(documents)
if not query or not documents: if not query or not documents:
@@ -912,9 +1141,25 @@ class AsyncUrlSeeder:
params={"error": str(e)}, tag="URL_SEED") params={"error": str(e)}, tag="URL_SEED")
return [0.0] * len(documents) return [0.0] * len(documents)
# ─────────────────────────────── cleanup methods
async def close(self):
"""Close the HTTP client if we own it."""
if self._owns_client and self.client:
await self.client.aclose()
self._log("debug", "Closed HTTP client", tag="URL_SEED")
async def __aenter__(self):
"""Async context manager entry."""
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
await self.close()
return False
# ─────────────────────────────── index helper # ─────────────────────────────── index helper
async def _latest_index(self)->str: async def _latest_index(self) -> str:
if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime)<self.ttl.total_seconds(): if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime) < self.ttl.total_seconds():
self._log("info", "Loading latest CC index from cache: {path}", self._log("info", "Loading latest CC index from cache: {path}",
params={"path": self.index_cache_path}, tag="URL_SEED") params={"path": self.index_cache_path}, tag="URL_SEED")
return self.index_cache_path.read_text().strip() return self.index_cache_path.read_text().strip()
@@ -923,9 +1168,9 @@ class AsyncUrlSeeder:
params={"url": COLLINFO_URL}, tag="URL_SEED") params={"url": COLLINFO_URL}, tag="URL_SEED")
try: try:
async with httpx.AsyncClient() as c: async with httpx.AsyncClient() as c:
j=await c.get(COLLINFO_URL,timeout=10) j = await c.get(COLLINFO_URL, timeout=10)
j.raise_for_status() # Raise an exception for bad status codes j.raise_for_status() # Raise an exception for bad status codes
idx=j.json()[0]["id"] idx = j.json()[0]["id"]
self.index_cache_path.write_text(idx) self.index_cache_path.write_text(idx)
self._log("success", "Successfully fetched and cached CC index: {index_id}", self._log("success", "Successfully fetched and cached CC index: {index_id}",
params={"index_id": idx}, tag="URL_SEED") params={"index_id": idx}, tag="URL_SEED")

View File

@@ -0,0 +1,806 @@
"""
BBC Sport Research Assistant Pipeline
=====================================
This example demonstrates how URLSeeder helps create an efficient research pipeline:
1. Discover all available URLs without crawling
2. Filter and rank them based on relevance
3. Crawl only the most relevant content
4. Generate comprehensive research insights
Pipeline Steps:
1. Get user query
2. Optionally enhance query using LLM
3. Use URLSeeder to discover and rank URLs
4. Crawl top K URLs with BM25 filtering
5. Generate detailed response with citations
Requirements:
- pip install crawl4ai
- pip install litellm
- export GEMINI_API_KEY="your-api-key"
Usage:
- Run normally: python bbc_sport_research_assistant.py
- Run test mode: python bbc_sport_research_assistant.py test
"""
import asyncio
import json
import os
import hashlib
import pickle
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
# Rich for colored output
from rich.console import Console
from rich.text import Text
from rich.panel import Panel
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn
# Crawl4AI imports
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
AsyncUrlSeeder,
SeedingConfig,
AsyncLogger
)
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
# LiteLLM for AI communication
import litellm
# Initialize Rich console
console = Console()
# Get the current directory where this script is located
SCRIPT_DIR = Path(__file__).parent.resolve()
# Cache configuration - relative to script directory
CACHE_DIR = SCRIPT_DIR / "temp_cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)
# Testing limits
TESTING_MODE = True
MAX_URLS_DISCOVERY = 100 if TESTING_MODE else 1000
MAX_URLS_TO_CRAWL = 5 if TESTING_MODE else 10
def get_cache_key(prefix: str, *args) -> str:
"""Generate cache key from prefix and arguments"""
content = f"{prefix}:{'|'.join(str(arg) for arg in args)}"
return hashlib.md5(content.encode()).hexdigest()
def load_from_cache(cache_key: str) -> Optional[any]:
"""Load data from cache if exists"""
cache_path = CACHE_DIR / f"{cache_key}.pkl"
if cache_path.exists():
with open(cache_path, 'rb') as f:
return pickle.load(f)
return None
def save_to_cache(cache_key: str, data: any) -> None:
"""Save data to cache"""
cache_path = CACHE_DIR / f"{cache_key}.pkl"
with open(cache_path, 'wb') as f:
pickle.dump(data, f)
@dataclass
class ResearchConfig:
"""Configuration for research pipeline"""
# Core settings
domain: str = "www.bbc.com/sport"
max_urls_discovery: int = 100
max_urls_to_crawl: int = 10
top_k_urls: int = 10
# Scoring and filtering
score_threshold: float = 0.1
scoring_method: str = "bm25"
# Processing options
use_llm_enhancement: bool = True
extract_head_metadata: bool = True
live_check: bool = True
force_refresh: bool = False
# Crawler settings
max_concurrent_crawls: int = 5
timeout: int = 30000
headless: bool = True
# Output settings
save_json: bool = True
save_markdown: bool = True
output_dir: str = None # Will be set in __post_init__
# Development settings
test_mode: bool = False
interactive_mode: bool = False
verbose: bool = True
def __post_init__(self):
"""Adjust settings based on test mode"""
if self.test_mode:
self.max_urls_discovery = 50
self.max_urls_to_crawl = 3
self.top_k_urls = 5
# Set default output directory relative to script location
if self.output_dir is None:
self.output_dir = str(SCRIPT_DIR / "research_results")
@dataclass
class ResearchQuery:
"""Container for research query and metadata"""
original_query: str
enhanced_query: Optional[str] = None
search_patterns: List[str] = None
timestamp: str = None
@dataclass
class ResearchResult:
"""Container for research results"""
query: ResearchQuery
discovered_urls: List[Dict]
crawled_content: List[Dict]
synthesis: str
citations: List[Dict]
metadata: Dict
async def get_user_query() -> str:
"""
Get research query from user input
"""
query = input("\n🔍 Enter your research query: ")
return query.strip()
async def enhance_query_with_llm(query: str) -> ResearchQuery:
"""
Use LLM to enhance the research query:
- Extract key terms
- Generate search patterns
- Identify related topics
"""
# Check cache
cache_key = get_cache_key("enhanced_query", query)
cached_result = load_from_cache(cache_key)
if cached_result:
console.print("[dim cyan]📦 Using cached enhanced query[/dim cyan]")
return cached_result
try:
response = await litellm.acompletion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{
"role": "user",
"content": f"""Given this research query: "{query}"
Extract:
1. Key terms and concepts (as a list)
2. Related search terms
3. A more specific/enhanced version of the query
Return as JSON:
{{
"key_terms": ["term1", "term2"],
"related_terms": ["related1", "related2"],
"enhanced_query": "enhanced version of query"
}}"""
}],
# reasoning_effort="low",
temperature=0.3,
response_format={"type": "json_object"}
)
data = json.loads(response.choices[0].message.content)
# Create search patterns
all_terms = data["key_terms"] + data["related_terms"]
patterns = [f"*{term.lower()}*" for term in all_terms]
result = ResearchQuery(
original_query=query,
enhanced_query=data["enhanced_query"],
search_patterns=patterns[:10], # Limit patterns
timestamp=datetime.now().isoformat()
)
# Cache the result
save_to_cache(cache_key, result)
return result
except Exception as e:
console.print(f"[yellow]⚠️ LLM enhancement failed: {e}[/yellow]")
# Fallback to simple tokenization
return ResearchQuery(
original_query=query,
enhanced_query=query,
search_patterns=tokenize_query_to_patterns(query),
timestamp=datetime.now().isoformat()
)
def tokenize_query_to_patterns(query: str) -> List[str]:
"""
Convert query into URL patterns for URLSeeder
Example: "AI startups funding" -> ["*ai*", "*startup*", "*funding*"]
"""
# Simple tokenization - split and create patterns
words = query.lower().split()
# Filter out common words
stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'that'}
keywords = [w for w in words if w not in stop_words and len(w) > 2]
# Create patterns
patterns = [f"*{keyword}*" for keyword in keywords]
return patterns[:8] # Limit to 8 patterns
async def discover_urls(domain: str, query: str, config: ResearchConfig) -> List[Dict]:
"""
Use URLSeeder to discover and rank URLs:
1. Fetch all URLs from domain
2. Filter by patterns
3. Extract metadata (titles, descriptions)
4. Rank by BM25 relevance score
5. Return top K URLs
"""
# Check cache
cache_key = get_cache_key("discovered_urls", domain, query, config.top_k_urls)
cached_result = load_from_cache(cache_key)
if cached_result and not config.force_refresh:
console.print("[dim cyan]📦 Using cached URL discovery[/dim cyan]")
return cached_result
console.print(f"\n[cyan]🔍 Discovering URLs from {domain}...[/cyan]")
# Initialize URL seeder
seeder = AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose))
# Configure seeding
seeding_config = SeedingConfig(
source="sitemap+cc", # Use both sitemap and Common Crawl
extract_head=config.extract_head_metadata,
query=query,
scoring_method=config.scoring_method,
score_threshold=config.score_threshold,
max_urls=config.max_urls_discovery,
live_check=config.live_check,
force=config.force_refresh
)
try:
# Discover URLs
urls = await seeder.urls(domain, seeding_config)
# Sort by relevance score (descending)
sorted_urls = sorted(
urls,
key=lambda x: x.get('relevance_score', 0),
reverse=True
)
# Take top K
top_urls = sorted_urls[:config.top_k_urls]
console.print(f"[green]✅ Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")
# Cache the result
save_to_cache(cache_key, top_urls)
return top_urls
except Exception as e:
console.print(f"[red]❌ URL discovery failed: {e}[/red]")
return []
async def crawl_selected_urls(urls: List[str], query: str, config: ResearchConfig) -> List[Dict]:
"""
Crawl selected URLs with content filtering:
- Use AsyncWebCrawler.arun_many()
- Apply content filter
- Generate clean markdown
"""
# Extract just URLs from the discovery results
url_list = [u['url'] for u in urls if 'url' in u][:config.max_urls_to_crawl]
if not url_list:
console.print("[red]❌ No URLs to crawl[/red]")
return []
console.print(f"\n[cyan]🕷️ Crawling {len(url_list)} URLs...[/cyan]")
# Check cache for each URL
crawled_results = []
urls_to_crawl = []
for url in url_list:
cache_key = get_cache_key("crawled_content", url, query)
cached_content = load_from_cache(cache_key)
if cached_content and not config.force_refresh:
crawled_results.append(cached_content)
else:
urls_to_crawl.append(url)
if urls_to_crawl:
console.print(f"[cyan]📥 Crawling {len(urls_to_crawl)} new URLs (cached: {len(crawled_results)})[/cyan]")
# Configure markdown generator with content filter
md_generator = DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="dynamic",
min_word_threshold=10
),
)
# Configure crawler
crawler_config = CrawlerRunConfig(
markdown_generator=md_generator,
exclude_external_links=True,
excluded_tags=['nav', 'header', 'footer', 'aside'],
)
# Create crawler with browser config
async with AsyncWebCrawler(
config=BrowserConfig(
headless=config.headless,
verbose=config.verbose
)
) as crawler:
# Crawl URLs
results = await crawler.arun_many(
urls_to_crawl,
config=crawler_config,
max_concurrent=config.max_concurrent_crawls
)
# Process results
for url, result in zip(urls_to_crawl, results):
if result.success:
content_data = {
'url': url,
'title': result.metadata.get('title', ''),
'markdown': result.markdown.fit_markdown or result.markdown.raw_markdown,
'raw_length': len(result.markdown.raw_markdown),
'fit_length': len(result.markdown.fit_markdown) if result.markdown.fit_markdown else len(result.markdown.raw_markdown),
'metadata': result.metadata
}
crawled_results.append(content_data)
# Cache the result
cache_key = get_cache_key("crawled_content", url, query)
save_to_cache(cache_key, content_data)
else:
console.print(f" [red]❌ Failed: {url[:50]}... - {result.error}[/red]")
console.print(f"[green]✅ Successfully crawled {len(crawled_results)} URLs[/green]")
return crawled_results
async def generate_research_synthesis(
query: str,
crawled_content: List[Dict]
) -> Tuple[str, List[Dict]]:
"""
Use LLM to synthesize research findings:
- Analyze all crawled content
- Generate comprehensive answer
- Extract citations and references
"""
if not crawled_content:
return "No content available for synthesis.", []
console.print("\n[cyan]🤖 Generating research synthesis...[/cyan]")
# Prepare content for LLM
content_sections = []
for i, content in enumerate(crawled_content, 1):
section = f"""
SOURCE {i}:
Title: {content['title']}
URL: {content['url']}
Content Preview:
{content['markdown'][:1500]}...
"""
content_sections.append(section)
combined_content = "\n---\n".join(content_sections)
try:
response = await litellm.acompletion(
model="gemini/gemini-2.5-flash-preview-04-17",
messages=[{
"role": "user",
"content": f"""Research Query: "{query}"
Based on the following sources, provide a comprehensive research synthesis.
{combined_content}
Please provide:
1. An executive summary (2-3 sentences)
2. Key findings (3-5 bullet points)
3. Detailed analysis (2-3 paragraphs)
4. Future implications or trends
Format your response with clear sections and cite sources using [Source N] notation.
Keep the total response under 800 words."""
}],
# reasoning_effort="medium",
temperature=0.7
)
synthesis = response.choices[0].message.content
# Extract citations from the synthesis
citations = []
for i, content in enumerate(crawled_content, 1):
if f"[Source {i}]" in synthesis or f"Source {i}" in synthesis:
citations.append({
'source_id': i,
'title': content['title'],
'url': content['url']
})
return synthesis, citations
except Exception as e:
console.print(f"[red]❌ Synthesis generation failed: {e}[/red]")
# Fallback to simple summary
summary = f"Research on '{query}' found {len(crawled_content)} relevant articles:\n\n"
for content in crawled_content[:3]:
summary += f"- {content['title']}\n {content['url']}\n\n"
return summary, []
def format_research_output(result: ResearchResult) -> str:
"""
Format the final research output with:
- Executive summary
- Key findings
- Detailed analysis
- Citations and sources
"""
output = []
output.append("\n" + "=" * 60)
output.append("🔬 RESEARCH RESULTS")
output.append("=" * 60)
# Query info
output.append(f"\n📋 Query: {result.query.original_query}")
if result.query.enhanced_query != result.query.original_query:
output.append(f" Enhanced: {result.query.enhanced_query}")
# Discovery stats
output.append(f"\n📊 Statistics:")
output.append(f" - URLs discovered: {len(result.discovered_urls)}")
output.append(f" - URLs crawled: {len(result.crawled_content)}")
output.append(f" - Processing time: {result.metadata.get('duration', 'N/A')}")
# Synthesis
output.append(f"\n📝 SYNTHESIS")
output.append("-" * 60)
output.append(result.synthesis)
# Citations
if result.citations:
output.append(f"\n📚 SOURCES")
output.append("-" * 60)
for citation in result.citations:
output.append(f"[{citation['source_id']}] {citation['title']}")
output.append(f" {citation['url']}")
return "\n".join(output)
async def save_research_results(result: ResearchResult, config: ResearchConfig) -> Tuple[str, str]:
"""
Save research results in JSON and Markdown formats
Returns:
Tuple of (json_path, markdown_path)
"""
# Create output directory
output_dir = Path(config.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Generate filename based on query and timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
query_slug = result.query.original_query[:50].replace(" ", "_").replace("/", "_")
base_filename = f"{timestamp}_{query_slug}"
json_path = None
md_path = None
# Save JSON
if config.save_json:
json_path = output_dir / f"{base_filename}.json"
with open(json_path, 'w') as f:
json.dump(asdict(result), f, indent=2, default=str)
console.print(f"\n[green]💾 JSON saved: {json_path}[/green]")
# Save Markdown
if config.save_markdown:
md_path = output_dir / f"{base_filename}.md"
# Create formatted markdown
md_content = [
f"# Research Report: {result.query.original_query}",
f"\n**Generated on:** {result.metadata.get('timestamp', 'N/A')}",
f"\n**Domain:** {result.metadata.get('domain', 'N/A')}",
f"\n**Processing time:** {result.metadata.get('duration', 'N/A')}",
"\n---\n",
"## Query Information",
f"- **Original Query:** {result.query.original_query}",
f"- **Enhanced Query:** {result.query.enhanced_query or 'N/A'}",
f"- **Search Patterns:** {', '.join(result.query.search_patterns or [])}",
"\n## Statistics",
f"- **URLs Discovered:** {len(result.discovered_urls)}",
f"- **URLs Crawled:** {len(result.crawled_content)}",
f"- **Sources Cited:** {len(result.citations)}",
"\n## Research Synthesis\n",
result.synthesis,
"\n## Sources\n"
]
# Add citations
for citation in result.citations:
md_content.append(f"### [{citation['source_id']}] {citation['title']}")
md_content.append(f"- **URL:** [{citation['url']}]({citation['url']})")
md_content.append("")
# Add discovered URLs summary
md_content.extend([
"\n## Discovered URLs (Top 10)\n",
"| Score | URL | Title |",
"|-------|-----|-------|"
])
for url_data in result.discovered_urls[:10]:
score = url_data.get('relevance_score', 0)
url = url_data.get('url', '')
title = 'N/A'
if 'head_data' in url_data and url_data['head_data']:
title = url_data['head_data'].get('title', 'N/A')[:60] + '...'
md_content.append(f"| {score:.3f} | {url[:50]}... | {title} |")
# Write markdown
with open(md_path, 'w') as f:
f.write('\n'.join(md_content))
console.print(f"[green]📄 Markdown saved: {md_path}[/green]")
return str(json_path) if json_path else None, str(md_path) if md_path else None
async def wait_for_user(message: str = "\nPress Enter to continue..."):
"""Wait for user input in interactive mode"""
input(message)
async def research_pipeline(
query: str,
config: ResearchConfig
) -> ResearchResult:
"""
Main research pipeline orchestrator with configurable settings
"""
start_time = datetime.now()
# Display pipeline header
header = Panel(
f"[bold cyan]Research Pipeline[/bold cyan]\n\n"
f"[dim]Domain:[/dim] {config.domain}\n"
f"[dim]Mode:[/dim] {'Test' if config.test_mode else 'Production'}\n"
f"[dim]Interactive:[/dim] {'Yes' if config.interactive_mode else 'No'}",
title="🚀 Starting",
border_style="cyan"
)
console.print(header)
# Step 1: Enhance query (optional)
console.print(f"\n[bold cyan]📝 Step 1: Query Processing[/bold cyan]")
if config.interactive_mode:
await wait_for_user()
if config.use_llm_enhancement:
research_query = await enhance_query_with_llm(query)
else:
research_query = ResearchQuery(
original_query=query,
enhanced_query=query,
search_patterns=tokenize_query_to_patterns(query),
timestamp=datetime.now().isoformat()
)
console.print(f" [green]✅ Query ready:[/green] {research_query.enhanced_query or query}")
# Step 2: Discover URLs
console.print(f"\n[bold cyan]🔍 Step 2: URL Discovery[/bold cyan]")
if config.interactive_mode:
await wait_for_user()
discovered_urls = await discover_urls(
domain=config.domain,
query=research_query.enhanced_query or query,
config=config
)
if not discovered_urls:
return ResearchResult(
query=research_query,
discovered_urls=[],
crawled_content=[],
synthesis="No relevant URLs found for the given query.",
citations=[],
metadata={'duration': str(datetime.now() - start_time)}
)
console.print(f" [green]✅ Found {len(discovered_urls)} relevant URLs[/green]")
# Step 3: Crawl selected URLs
console.print(f"\n[bold cyan]🕷️ Step 3: Content Crawling[/bold cyan]")
if config.interactive_mode:
await wait_for_user()
crawled_content = await crawl_selected_urls(
urls=discovered_urls,
query=research_query.enhanced_query or query,
config=config
)
console.print(f" [green]✅ Successfully crawled {len(crawled_content)} pages[/green]")
# Step 4: Generate synthesis
console.print(f"\n[bold cyan]🤖 Step 4: Synthesis Generation[/bold cyan]")
if config.interactive_mode:
await wait_for_user()
synthesis, citations = await generate_research_synthesis(
query=research_query.enhanced_query or query,
crawled_content=crawled_content
)
console.print(f" [green]✅ Generated synthesis with {len(citations)} citations[/green]")
# Step 5: Create result
result = ResearchResult(
query=research_query,
discovered_urls=discovered_urls,
crawled_content=crawled_content,
synthesis=synthesis,
citations=citations,
metadata={
'duration': str(datetime.now() - start_time),
'domain': config.domain,
'timestamp': datetime.now().isoformat(),
'config': asdict(config)
}
)
duration = datetime.now() - start_time
console.print(f"\n[bold green]✅ Research completed in {duration}[/bold green]")
return result
async def main():
"""
Main entry point for the BBC Sport Research Assistant
"""
# Example queries
example_queries = [
"Premier League transfer news and rumors",
"Champions League match results and analysis",
"World Cup qualifying updates",
"Football injury reports and return dates",
"Tennis grand slam tournament results"
]
# Display header
console.print(Panel.fit(
"[bold cyan]BBC Sport Research Assistant[/bold cyan]\n\n"
"This tool demonstrates efficient research using URLSeeder:\n"
"[dim]• Discover all URLs without crawling\n"
"• Filter and rank by relevance\n"
"• Crawl only the most relevant content\n"
"• Generate AI-powered insights with citations[/dim]\n\n"
f"[dim]📁 Working directory: {SCRIPT_DIR}[/dim]",
title="🔬 Welcome",
border_style="cyan"
))
# Configuration options table
config_table = Table(title="\n⚙️ Configuration Options", show_header=False, box=None)
config_table.add_column(style="bold cyan", width=3)
config_table.add_column()
config_table.add_row("1", "Quick Test Mode (3 URLs, fast)")
config_table.add_row("2", "Standard Mode (10 URLs, balanced)")
config_table.add_row("3", "Comprehensive Mode (20 URLs, thorough)")
config_table.add_row("4", "Custom Configuration")
console.print(config_table)
config_choice = input("\nSelect configuration (1-4): ").strip()
# Create config based on choice
if config_choice == "1":
config = ResearchConfig(test_mode=True, interactive_mode=False)
elif config_choice == "2":
config = ResearchConfig(max_urls_to_crawl=10, top_k_urls=10)
elif config_choice == "3":
config = ResearchConfig(max_urls_to_crawl=20, top_k_urls=20, max_urls_discovery=200)
else:
# Custom configuration
config = ResearchConfig()
config.test_mode = input("\nTest mode? (y/n): ").lower() == 'y'
config.interactive_mode = input("Interactive mode (pause between steps)? (y/n): ").lower() == 'y'
config.use_llm_enhancement = input("Use AI to enhance queries? (y/n): ").lower() == 'y'
if not config.test_mode:
try:
config.max_urls_to_crawl = int(input("Max URLs to crawl (default 10): ") or "10")
config.top_k_urls = int(input("Top K URLs to select (default 10): ") or "10")
except ValueError:
console.print("[yellow]Using default values[/yellow]")
# Display example queries
query_table = Table(title="\n📋 Example Queries", show_header=False, box=None)
query_table.add_column(style="bold cyan", width=3)
query_table.add_column()
for i, q in enumerate(example_queries, 1):
query_table.add_row(str(i), q)
console.print(query_table)
query_input = input("\nSelect a query (1-5) or enter your own: ").strip()
if query_input.isdigit() and 1 <= int(query_input) <= len(example_queries):
query = example_queries[int(query_input) - 1]
else:
query = query_input if query_input else example_queries[0]
console.print(f"\n[bold cyan]📝 Selected Query:[/bold cyan] {query}")
# Run the research pipeline
result = await research_pipeline(query=query, config=config)
# Display results
formatted_output = format_research_output(result)
# print(formatted_output)
console.print(Panel.fit(
formatted_output,
title="🔬 Research Results",
border_style="green"
))
# Save results
if config.save_json or config.save_markdown:
json_path, md_path = await save_research_results(result, config)
# print(f"\n✅ Results saved successfully!")
if json_path:
console.print(f"[green]JSON saved at:[/green] {json_path}")
if md_path:
console.print(f"[green]Markdown saved at:[/green] {md_path}")
if __name__ == "__main__":
asyncio.run(main())