feat: Add AsyncUrlSeeder for intelligent URL discovery and filtering
This commit introduces AsyncUrlSeeder, a high-performance URL discovery system that enables intelligent crawling at scale by pre-discovering and filtering URLs before crawling. ## Core Features ### AsyncUrlSeeder Component - Discovers URLs from multiple sources: - Sitemaps (including nested and gzipped) - Common Crawl index - Combined sources for maximum coverage - Extracts page metadata without full crawling: - Title, description, keywords - Open Graph and Twitter Card tags - JSON-LD structured data - Language and charset information - BM25 relevance scoring for intelligent filtering: - Query-based URL discovery - Configurable score thresholds - Automatic ranking by relevance - Performance optimizations: - Async/concurrent processing with configurable workers - Rate limiting (hits per second) - Automatic caching with TTL - Streaming results for large datasets ### SeedingConfig - Comprehensive configuration for URL seeding: - Source selection (sitemap, cc, or both) - URL pattern filtering with wildcards - Live URL validation options - Metadata extraction controls - BM25 scoring parameters - Concurrency and rate limiting ### Integration with AsyncWebCrawler - Seamless pipeline: discover → filter → crawl - Direct compatibility with arun_many() - Significant resource savings by pre-filtering URLs ## Documentation - Comprehensive guide comparing URL seeding vs deep crawling - Complete API reference with parameter tables - Practical examples showing all features - Performance benchmarks and best practices - Integration patterns with AsyncWebCrawler ## Examples - url_seeder_demo.py: Interactive Rich-based demo with: - Basic discovery - Cache management - Live validation - BM25 scoring - Multi-domain discovery - Complete pipeline integration - url_seeder_quick_demo.py: Screenshot-friendly examples: - Pattern-based filtering - Metadata exploration - Smart search with BM25 ## Testing - Comprehensive test suite (test_async_url_seeder_bm25.py) - Coverage of all major features - Edge cases and error handling - Performance and consistency tests ## Implementation Details - Built on httpx with HTTP/2 support - Optional dependencies: lxml, brotli, rank_bm25 - Cache management in ~/.crawl4ai/seeder_cache/ - Logger integration with AsyncLoggerBase - Proper error handling and retry logic ## Bug Fixes - Fixed logger color compatibility (lightblack → bright_black) - Corrected URL extraction from seeder results for arun_many() - Updated all examples and documentation with proper usage This feature enables users to crawl smarter, not harder, by discovering and analyzing URLs before committing resources to crawling them.
This commit is contained in:
3
.claude/settings.local.json
Normal file
3
.claude/settings.local.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"enableAllProjectMcpServers": false
|
||||
}
|
||||
@@ -2,7 +2,8 @@
|
||||
import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig
|
||||
# MODIFIED: Add SeedingConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -65,6 +66,8 @@ from .deep_crawling import (
|
||||
DFSDeepCrawlStrategy,
|
||||
DeepCrawlDecorator,
|
||||
)
|
||||
# NEW: Import AsyncUrlSeeder
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
|
||||
from .utils import (
|
||||
start_colab_display_server,
|
||||
@@ -78,6 +81,10 @@ __all__ = [
|
||||
"BrowserProfiler",
|
||||
"LLMConfig",
|
||||
"GeolocationConfig",
|
||||
# NEW: Add SeedingConfig
|
||||
"SeedingConfig",
|
||||
# NEW: Add AsyncUrlSeeder
|
||||
"AsyncUrlSeeder",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
@@ -160,4 +167,4 @@ __all__ = [
|
||||
|
||||
# Disable all Pydantic warnings
|
||||
warnings.filterwarnings("ignore", module="pydantic")
|
||||
# pydantic_warnings.filter_warnings()
|
||||
# pydantic_warnings.filter_warnings()
|
||||
@@ -207,7 +207,6 @@ class GeolocationConfig:
|
||||
config_dict.update(kwargs)
|
||||
return GeolocationConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -318,8 +317,6 @@ class ProxyConfig:
|
||||
config_dict.update(kwargs)
|
||||
return ProxyConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
||||
@@ -597,7 +594,6 @@ class BrowserConfig:
|
||||
return config
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
|
||||
class HTTPCrawlerConfig:
|
||||
"""HTTP-specific crawler configuration"""
|
||||
|
||||
@@ -1329,7 +1325,6 @@ class CrawlerRunConfig():
|
||||
config_dict.update(kwargs)
|
||||
return CrawlerRunConfig.from_kwargs(config_dict)
|
||||
|
||||
|
||||
class LLMConfig:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -1414,4 +1409,51 @@ class LLMConfig:
|
||||
config_dict.update(kwargs)
|
||||
return LLMConfig.from_kwargs(config_dict)
|
||||
|
||||
class SeedingConfig:
|
||||
"""
|
||||
Configuration class for URL discovery and pre-validation via AsyncUrlSeeder.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
source: str = "sitemap+cc", # Options: "sitemap", "cc", "sitemap+cc"
|
||||
pattern: Optional[str] = "*", # URL pattern to filter discovered URLs (e.g., "*example.com/blog/*")
|
||||
live_check: bool = False, # Whether to perform HEAD requests to verify URL liveness
|
||||
extract_head: bool = False, # Whether to fetch and parse <head> section for metadata
|
||||
max_urls: int = -1, # Maximum number of URLs to discover (default: -1 for no limit)
|
||||
concurrency: int = 1000, # Maximum concurrent requests for live checks/head extraction
|
||||
hits_per_sec: int = 5, # Rate limit in requests per second
|
||||
force: bool = False, # If True, bypasses the AsyncUrlSeeder's internal .jsonl cache
|
||||
base_directory: Optional[str] = None, # Base directory for UrlSeeder's cache files (.jsonl)
|
||||
llm_config: Optional[LLMConfig] = None, # Forward LLM config for future use (e.g., relevance scoring)
|
||||
verbose: Optional[bool] = None, # Override crawler's general verbose setting
|
||||
query: Optional[str] = None, # Search query for relevance scoring
|
||||
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
|
||||
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
|
||||
):
|
||||
self.source = source
|
||||
self.pattern = pattern
|
||||
self.live_check = live_check
|
||||
self.extract_head = extract_head
|
||||
self.max_urls = max_urls
|
||||
self.concurrency = concurrency
|
||||
self.hits_per_sec = hits_per_sec
|
||||
self.force = force
|
||||
self.base_directory = base_directory
|
||||
self.llm_config = llm_config
|
||||
self.verbose = verbose
|
||||
self.query = query
|
||||
self.score_threshold = score_threshold
|
||||
self.scoring_method = scoring_method
|
||||
|
||||
# Add to_dict, from_kwargs, and clone methods for consistency
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {k: v for k, v in self.__dict__.items() if k != 'llm_config' or v is not None}
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: Dict[str, Any]) -> 'SeedingConfig':
|
||||
return SeedingConfig(**kwargs)
|
||||
|
||||
def clone(self, **kwargs: Any) -> 'SeedingConfig':
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return SeedingConfig.from_kwargs(config_dict)
|
||||
|
||||
@@ -29,7 +29,7 @@ class LogLevel(Enum):
|
||||
class LogColor(str, Enum):
|
||||
"""Enum for log colors."""
|
||||
|
||||
DEBUG = "lightblack"
|
||||
DEBUG = "bright_black"
|
||||
INFO = "cyan"
|
||||
SUCCESS = "green"
|
||||
WARNING = "yellow"
|
||||
|
||||
944
crawl4ai/async_url_seeder.py
Normal file
944
crawl4ai/async_url_seeder.py
Normal file
@@ -0,0 +1,944 @@
|
||||
"""
|
||||
async_url_seeder.py
|
||||
Fast async URL discovery for Crawl4AI
|
||||
|
||||
Features
|
||||
--------
|
||||
* Common-Crawl streaming via httpx.AsyncClient (HTTP/2, keep-alive)
|
||||
* robots.txt → sitemap chain (.gz + nested indexes) via async httpx
|
||||
* Per-domain CDX result cache on disk (~/.crawl4ai/<index>_<domain>_<hash>.jsonl)
|
||||
* Optional HEAD-only liveness check
|
||||
* Optional partial <head> download + meta parsing
|
||||
* Global hits-per-second rate-limit via asyncio.Semaphore
|
||||
* Concurrency in the thousands — fine on a single event-loop
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import aiofiles, asyncio, gzip, hashlib, io, json, os, pathlib, re, time
|
||||
from datetime import timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import httpx
|
||||
import fnmatch
|
||||
try:
|
||||
from lxml import html as lxml_html
|
||||
from lxml import etree
|
||||
LXML = True
|
||||
except ImportError:
|
||||
LXML = False
|
||||
try:
|
||||
import brotli
|
||||
HAS_BROTLI = True
|
||||
except ImportError:
|
||||
HAS_BROTLI = False
|
||||
try:
|
||||
import rank_bm25
|
||||
HAS_BM25 = True
|
||||
except ImportError:
|
||||
HAS_BM25 = False
|
||||
|
||||
# Import AsyncLoggerBase from crawl4ai's logger module
|
||||
# Assuming crawl4ai/async_logger.py defines AsyncLoggerBase
|
||||
# You might need to adjust this import based on your exact file structure
|
||||
from .async_logger import AsyncLoggerBase, AsyncLogger # Import AsyncLogger for default if needed
|
||||
|
||||
# Import SeedingConfig for type hints
|
||||
from typing import TYPE_CHECKING
|
||||
if TYPE_CHECKING:
|
||||
from .async_configs import SeedingConfig
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────────── consts
|
||||
COLLINFO_URL = "https://index.commoncrawl.org/collinfo.json"
|
||||
# CACHE_DIR = pathlib.Path("~/.crawl4ai").expanduser() # REMOVED: now managed by __init__
|
||||
# CACHE_DIR.mkdir(exist_ok=True) # REMOVED: now managed by __init__
|
||||
# INDEX_CACHE = CACHE_DIR / "latest_cc_index.txt" # REMOVED: now managed by __init__
|
||||
TTL = timedelta(days=7) # Keeping this constant as it's a seeder-specific TTL
|
||||
|
||||
_meta_rx = re.compile(
|
||||
r'<meta\s+(?:[^>]*?(?:name|property|http-equiv)\s*=\s*["\']?([^"\' >]+)[^>]*?content\s*=\s*["\']?([^"\' >]+)[^>]*?)\/?>',
|
||||
re.I)
|
||||
_charset_rx = re.compile(r'<meta\s+[^>]*charset=["\']?([^"\' >]+)', re.I)
|
||||
_title_rx = re.compile(r'<title>(.*?)</title>', re.I|re.S)
|
||||
_link_rx = re.compile(r'<link\s+[^>]*rel=["\']?([^"\' >]+)[^>]*href=["\']?([^"\' >]+)', re.I)
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────────── helpers
|
||||
def _match(url: str, pattern: str) -> bool:
|
||||
if fnmatch.fnmatch(url, pattern):
|
||||
return True
|
||||
canon = url.split("://", 1)[-1]
|
||||
return (fnmatch.fnmatch(canon, pattern)
|
||||
or (canon.startswith("www.") and fnmatch.fnmatch(canon[4:], pattern)))
|
||||
|
||||
def _parse_head(src: str) -> Dict[str, Any]:
|
||||
if LXML:
|
||||
try:
|
||||
if isinstance(src, str):
|
||||
src = src.encode("utf-8", "replace") # strip Unicode, let lxml decode
|
||||
doc = lxml_html.fromstring(src)
|
||||
except (ValueError, etree.ParserError):
|
||||
return {} # malformed, bail gracefully
|
||||
info: Dict[str, Any] = {
|
||||
"title": (doc.find(".//title").text or "").strip()
|
||||
if doc.find(".//title") is not None else None,
|
||||
"charset": None,
|
||||
"meta": {}, "link": {}, "jsonld": []
|
||||
}
|
||||
for el in doc.xpath(".//meta"):
|
||||
k = el.attrib.get("name") or el.attrib.get("property") or el.attrib.get("http-equiv")
|
||||
if k: info["meta"][k.lower()] = el.attrib.get("content", "")
|
||||
elif "charset" in el.attrib: info["charset"] = el.attrib["charset"].lower()
|
||||
for el in doc.xpath(".//link"):
|
||||
rel = " ".join(el.attrib.get("rel", [])).lower()
|
||||
if not rel: continue
|
||||
entry = {a: el.attrib[a] for a in ("href","as","type","hreflang") if a in el.attrib}
|
||||
info["link"].setdefault(rel, []).append(entry)
|
||||
# Extract JSON-LD structured data
|
||||
for script in doc.xpath('.//script[@type="application/ld+json"]'):
|
||||
if script.text:
|
||||
try:
|
||||
jsonld_data = json.loads(script.text.strip())
|
||||
info["jsonld"].append(jsonld_data)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Extract html lang attribute
|
||||
html_elem = doc.find(".//html")
|
||||
if html_elem is not None:
|
||||
info["lang"] = html_elem.attrib.get("lang", "")
|
||||
return info
|
||||
# regex fallback
|
||||
info: Dict[str,Any] = {"title":None,"charset":None,"meta":{},"link":{},"jsonld":[],"lang":""}
|
||||
m=_title_rx.search(src); info["title"]=m.group(1).strip() if m else None
|
||||
for k,v in _meta_rx.findall(src): info["meta"][k.lower()]=v
|
||||
m=_charset_rx.search(src); info["charset"]=m.group(1).lower() if m else None
|
||||
for rel,href in _link_rx.findall(src):
|
||||
info["link"].setdefault(rel.lower(),[]).append({"href":href})
|
||||
# Try to extract JSON-LD with regex
|
||||
jsonld_pattern = re.compile(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', re.I|re.S)
|
||||
for match in jsonld_pattern.findall(src):
|
||||
try:
|
||||
jsonld_data = json.loads(match.strip())
|
||||
info["jsonld"].append(jsonld_data)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Try to extract lang attribute
|
||||
lang_match = re.search(r'<html[^>]*lang=["\']?([^"\' >]+)', src, re.I)
|
||||
if lang_match:
|
||||
info["lang"] = lang_match.group(1)
|
||||
return info
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────────── class
|
||||
class AsyncUrlSeeder:
|
||||
"""
|
||||
Async version of UrlSeeder.
|
||||
Call pattern is await/async for / async with.
|
||||
|
||||
Public coroutine
|
||||
----------------
|
||||
await seed.urls(...)
|
||||
returns List[Dict[str,Any]] (url, status, head_data)
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
ttl: timedelta = TTL,
|
||||
client: Optional[httpx.AsyncClient]=None,
|
||||
logger: Optional[AsyncLoggerBase] = None, # NEW: Add logger parameter
|
||||
base_directory: Optional[Union[str, pathlib.Path]] = None, # NEW: Add base_directory
|
||||
cache_root: Optional[Union[str, Path]] = None,
|
||||
):
|
||||
self.ttl = ttl
|
||||
self.client = client or httpx.AsyncClient(http2=True, timeout=20, headers={
|
||||
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) +AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
|
||||
})
|
||||
self.logger = logger # Store the logger instance
|
||||
self.base_directory = pathlib.Path(base_directory or os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) # Resolve base_directory
|
||||
self.cache_dir = self.base_directory / ".crawl4ai" / "seeder_cache" # NEW: Specific cache dir for seeder
|
||||
self.cache_dir.mkdir(parents=True, exist_ok=True) # Ensure it exists
|
||||
self.index_cache_path = self.cache_dir / "latest_cc_index.txt" # NEW: Index cache path
|
||||
|
||||
# defer – grabbing the index inside an active loop blows up
|
||||
self.index_id: Optional[str] = None
|
||||
self._rate_sem: Optional[asyncio.Semaphore] = None
|
||||
|
||||
# ───────── cache dirs ─────────
|
||||
self.cache_root = Path(os.path.expanduser(cache_root or "~/.cache/url_seeder"))
|
||||
(self.cache_root / "live").mkdir(parents=True, exist_ok=True)
|
||||
(self.cache_root / "head").mkdir(exist_ok=True)
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "URL_SEED", **kwargs: Any):
|
||||
"""Helper to log messages using the provided logger, if available."""
|
||||
if self.logger:
|
||||
log_method = getattr(self.logger, level, None)
|
||||
if log_method:
|
||||
log_method(message=message, tag=tag, params=kwargs.get('params', {}))
|
||||
# else: # Fallback for unknown level, should not happen with AsyncLoggerBase
|
||||
# print(f"[{tag}] {level.upper()}: {message.format(**kwargs)}")
|
||||
|
||||
# ───────── cache helpers ─────────
|
||||
def _cache_path(self, kind: str, url: str) -> Path:
|
||||
h = hashlib.sha1(url.encode()).hexdigest()
|
||||
return self.cache_root / kind / f"{h}.json"
|
||||
|
||||
def _cache_get(self, kind: str, url: str) -> Optional[Dict[str, Any]]:
|
||||
p = self._cache_path(kind, url)
|
||||
if not p.exists():
|
||||
return None
|
||||
# TTL check
|
||||
if time.time() - p.stat().st_mtime > self.ttl.total_seconds():
|
||||
return None
|
||||
try:
|
||||
return json.loads(p.read_text())
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _cache_set(self, kind: str, url: str, data: Dict[str, Any]) -> None:
|
||||
try:
|
||||
self._cache_path(kind, url).write_text(
|
||||
json.dumps(data, separators=(",", ":"))
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# ─────────────────────────────── discovery entry
|
||||
async def urls(self,
|
||||
domain: str,
|
||||
config: "SeedingConfig",
|
||||
) -> List[Dict[str,Any]]:
|
||||
"""
|
||||
Fetch URLs for a domain using configuration from SeedingConfig.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
domain : str
|
||||
The domain to fetch URLs for (e.g., "example.com")
|
||||
config : SeedingConfig
|
||||
Configuration object containing all seeding parameters
|
||||
"""
|
||||
# Extract parameters from config
|
||||
pattern = config.pattern or "*"
|
||||
source = config.source
|
||||
live_check = config.live_check
|
||||
extract_head = config.extract_head
|
||||
concurrency = config.concurrency
|
||||
head_timeout = 5 # Default timeout for HEAD requests
|
||||
hits_per_sec = config.hits_per_sec
|
||||
self.force = config.force # Store force flag as instance attribute
|
||||
force = config.force
|
||||
verbose = config.verbose if config.verbose is not None else (self.logger.verbose if self.logger else False)
|
||||
max_urls = config.max_urls if config.max_urls is not None else -1
|
||||
query = config.query
|
||||
score_threshold = config.score_threshold
|
||||
scoring_method = config.scoring_method
|
||||
|
||||
# Ensure seeder's logger verbose matches the config's verbose if it's set
|
||||
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
||||
self.logger.verbose = config.verbose
|
||||
|
||||
# ensure we have the latest CC collection id
|
||||
if self.index_id is None:
|
||||
self.index_id = await self._latest_index()
|
||||
|
||||
# Parse source parameter - split by '+' to get list of sources
|
||||
sources = source.split('+')
|
||||
valid_sources = {"cc", "sitemap"}
|
||||
for s in sources:
|
||||
if s not in valid_sources:
|
||||
raise ValueError(f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
|
||||
|
||||
if hits_per_sec:
|
||||
if hits_per_sec <= 0:
|
||||
self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
|
||||
self._rate_sem = None
|
||||
else:
|
||||
self._rate_sem = asyncio.Semaphore(hits_per_sec)
|
||||
else:
|
||||
self._rate_sem = None # Ensure it's None if no rate limiting
|
||||
|
||||
self._log("info", "Starting URL seeding for {domain} with source={source}",
|
||||
params={"domain": domain, "source": source}, tag="URL_SEED")
|
||||
|
||||
# choose stream
|
||||
async def gen():
|
||||
if "sitemap" in sources:
|
||||
self._log("debug", "Fetching from sitemaps...", tag="URL_SEED")
|
||||
async for u in self._from_sitemaps(domain, pattern, force):
|
||||
yield u
|
||||
if "cc" in sources:
|
||||
self._log("debug", "Fetching from Common Crawl...", tag="URL_SEED")
|
||||
async for u in self._from_cc(domain, pattern, force):
|
||||
yield u
|
||||
|
||||
queue = asyncio.Queue()
|
||||
producer_done = asyncio.Event()
|
||||
stop_event = asyncio.Event()
|
||||
seen: set[str] = set()
|
||||
|
||||
async def producer():
|
||||
try:
|
||||
async for u in gen():
|
||||
if u in seen:
|
||||
self._log("debug", "Skipping duplicate URL: {url}",
|
||||
params={"url": u}, tag="URL_SEED")
|
||||
continue
|
||||
if stop_event.is_set():
|
||||
self._log("info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
|
||||
break
|
||||
await queue.put(u)
|
||||
except Exception as e:
|
||||
self._log("error", "Producer encountered an error: {error}", params={"error": str(e)}, tag="URL_SEED")
|
||||
finally:
|
||||
producer_done.set()
|
||||
self._log("debug", "Producer finished.", tag="URL_SEED")
|
||||
|
||||
|
||||
async def worker(res_list: List[Dict[str,Any]]):
|
||||
while True:
|
||||
if queue.empty() and producer_done.is_set():
|
||||
# self._log("debug", "Worker exiting: queue empty and producer done.", tag="URL_SEED")
|
||||
break
|
||||
try:
|
||||
url = await asyncio.wait_for(queue.get(), 5) # Increased timeout slightly
|
||||
except asyncio.TimeoutError:
|
||||
continue # Keep checking queue and producer_done status
|
||||
except Exception as e:
|
||||
self._log("error", "Worker failed to get URL from queue: {error}", params={"error": str(e)}, tag="URL_SEED")
|
||||
continue
|
||||
|
||||
if max_urls > 0 and len(res_list) >= max_urls:
|
||||
self._log(
|
||||
"info",
|
||||
"Worker stopping due to max_urls limit.",
|
||||
tag="URL_SEED",
|
||||
)
|
||||
stop_event.set()
|
||||
|
||||
# mark the current item done
|
||||
queue.task_done()
|
||||
|
||||
# flush whatever is still sitting in the queue so
|
||||
# queue.join() can finish cleanly
|
||||
while not queue.empty():
|
||||
try:
|
||||
queue.get_nowait()
|
||||
queue.task_done()
|
||||
except asyncio.QueueEmpty:
|
||||
break
|
||||
break
|
||||
|
||||
if self._rate_sem: # global QPS control
|
||||
async with self._rate_sem:
|
||||
await self._validate(url, res_list, live_check, extract_head,
|
||||
head_timeout, verbose)
|
||||
else:
|
||||
await self._validate(url, res_list, live_check, extract_head,
|
||||
head_timeout, verbose)
|
||||
queue.task_done() # Mark task as done for queue.join() if ever used
|
||||
|
||||
# launch
|
||||
results: List[Dict[str,Any]] = []
|
||||
prod_task = asyncio.create_task(producer())
|
||||
workers = [asyncio.create_task(worker(results)) for _ in range(concurrency)]
|
||||
|
||||
# Wait for all workers to finish
|
||||
await asyncio.gather(prod_task, *workers)
|
||||
await queue.join() # Ensure all queued items are processed
|
||||
|
||||
self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
|
||||
params={"domain": domain, "count": len(results)}, tag="URL_SEED")
|
||||
|
||||
# Apply BM25 scoring if query is provided and extract_head is enabled
|
||||
if query and extract_head and scoring_method == "bm25":
|
||||
self._log("info", "Applying BM25 scoring for query: '{query}'",
|
||||
params={"query": query}, tag="URL_SEED")
|
||||
|
||||
# Extract text contexts from all results
|
||||
documents = []
|
||||
valid_indices = []
|
||||
for i, result in enumerate(results):
|
||||
if result.get("head_data"):
|
||||
text_context = self._extract_text_context(result["head_data"])
|
||||
if text_context: # Only include non-empty contexts
|
||||
documents.append(text_context)
|
||||
valid_indices.append(i)
|
||||
|
||||
if documents:
|
||||
# Calculate BM25 scores
|
||||
scores = self._calculate_bm25_score(query, documents)
|
||||
|
||||
# Add scores to results
|
||||
for idx, score in zip(valid_indices, scores):
|
||||
results[idx]["relevance_score"] = float(score)
|
||||
|
||||
# Add zero scores to results without head_data
|
||||
for i, result in enumerate(results):
|
||||
if i not in valid_indices:
|
||||
result["relevance_score"] = 0.0
|
||||
|
||||
# Filter by score threshold if specified
|
||||
if score_threshold is not None:
|
||||
original_count = len(results)
|
||||
results = [r for r in results if r.get("relevance_score", 0.0) >= score_threshold]
|
||||
self._log("info", "Filtered {filtered} URLs below score threshold {threshold}. Remaining: {remaining}",
|
||||
params={"filtered": original_count - len(results),
|
||||
"threshold": score_threshold,
|
||||
"remaining": len(results)}, tag="URL_SEED")
|
||||
|
||||
# Sort by relevance score (highest first)
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
|
||||
else:
|
||||
self._log("warning", "No valid head data found for BM25 scoring.", tag="URL_SEED")
|
||||
# Add zero scores to all results
|
||||
for result in results:
|
||||
result["relevance_score"] = 0.0
|
||||
elif query and not extract_head:
|
||||
self._log("warning", "Query provided but extract_head is False. Enable extract_head for relevance scoring.", tag="URL_SEED")
|
||||
|
||||
return results[:max_urls] if max_urls > 0 else results
|
||||
|
||||
async def many_urls(
|
||||
self,
|
||||
domains: Sequence[str],
|
||||
config: "SeedingConfig",
|
||||
) -> Dict[str, List[Dict[str,Any]]]:
|
||||
"""
|
||||
Fetch URLs for many domains in parallel.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
domains : Sequence[str]
|
||||
List of domains to fetch URLs for
|
||||
config : SeedingConfig
|
||||
Configuration object containing all seeding parameters
|
||||
|
||||
Returns a {domain: urls-list} dict.
|
||||
"""
|
||||
self._log("info", "Starting URL seeding for {count} domains...",
|
||||
params={"count": len(domains)}, tag="URL_SEED")
|
||||
|
||||
# Ensure seeder's logger verbose matches the config's verbose if it's set
|
||||
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
||||
self.logger.verbose = config.verbose
|
||||
|
||||
tasks = [
|
||||
self.urls(domain, config)
|
||||
for domain in domains
|
||||
]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
final_results = dict(zip(domains, results))
|
||||
self._log("info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
|
||||
return final_results
|
||||
|
||||
async def _resolve_head(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
HEAD-probe a URL.
|
||||
|
||||
Returns:
|
||||
* the same URL if it answers 2xx,
|
||||
* the absolute redirect target if it answers 3xx,
|
||||
* None on any other status or network error.
|
||||
"""
|
||||
try:
|
||||
r = await self.client.head(url, timeout=10, follow_redirects=False)
|
||||
|
||||
# direct hit
|
||||
if 200 <= r.status_code < 300:
|
||||
return str(r.url)
|
||||
|
||||
# single level redirect
|
||||
if r.status_code in (301, 302, 303, 307, 308):
|
||||
loc = r.headers.get("location")
|
||||
if loc:
|
||||
return urljoin(url, loc)
|
||||
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
self._log("debug", "HEAD {url} failed: {err}",
|
||||
params={"url": url, "err": str(e)}, tag="URL_SEED")
|
||||
return None
|
||||
|
||||
|
||||
# ─────────────────────────────── CC
|
||||
async def _from_cc(self, domain:str, pattern:str, force:bool):
|
||||
import re
|
||||
digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||
|
||||
# ── normalise for CC (strip scheme, query, fragment)
|
||||
raw = re.sub(r'^https?://', '', domain).split('#', 1)[0].split('?', 1)[0].lstrip('.')
|
||||
|
||||
# ── sanitize only for cache-file name
|
||||
safe = re.sub('[/?#]+', '_', raw)
|
||||
path = self.cache_dir / f"{self.index_id}_{safe}_{digest}.jsonl"
|
||||
|
||||
if path.exists() and not force:
|
||||
self._log("info", "Loading CC URLs for {domain} from cache: {path}",
|
||||
params={"domain": domain, "path": path}, tag="URL_SEED")
|
||||
async with aiofiles.open(path,"r") as fp:
|
||||
async for line in fp:
|
||||
url=line.strip()
|
||||
if _match(url,pattern): yield url
|
||||
return
|
||||
|
||||
# build CC glob – if a path is present keep it, else add trailing /*
|
||||
glob = f"*.{raw}*" if '/' in raw else f"*.{raw}/*"
|
||||
url = f"https://index.commoncrawl.org/{self.index_id}-index?url={quote(glob, safe='*')}&output=json"
|
||||
|
||||
retries=(1,3,7)
|
||||
self._log("info", "Fetching CC URLs for {domain} from Common Crawl index: {url}",
|
||||
params={"domain": domain, "url": url}, tag="URL_SEED")
|
||||
for i,d in enumerate(retries+(-1,)): # last -1 means don't retry
|
||||
try:
|
||||
async with self.client.stream("GET", url) as r:
|
||||
r.raise_for_status()
|
||||
async with aiofiles.open(path,"w") as fp:
|
||||
async for line in r.aiter_lines():
|
||||
rec = json.loads(line)
|
||||
u = rec["url"]
|
||||
await fp.write(u+"\n")
|
||||
if _match(u,pattern): yield u
|
||||
return
|
||||
except httpx.HTTPStatusError as e:
|
||||
if e.response.status_code==503 and i<len(retries):
|
||||
self._log("warning", "Common Crawl API returned 503 for {domain}. Retrying in {delay}s.",
|
||||
params={"domain": domain, "delay": retries[i]}, tag="URL_SEED")
|
||||
await asyncio.sleep(retries[i])
|
||||
continue
|
||||
self._log("error", "HTTP error fetching CC index for {domain}: {error}",
|
||||
params={"domain": domain, "error": str(e)}, tag="URL_SEED")
|
||||
raise
|
||||
except Exception as e:
|
||||
self._log("error", "Error fetching CC index for {domain}: {error}",
|
||||
params={"domain": domain, "error": str(e)}, tag="URL_SEED")
|
||||
raise
|
||||
|
||||
|
||||
# ─────────────────────────────── Sitemaps
|
||||
async def _from_sitemaps(self, domain:str, pattern:str, force:bool=False):
|
||||
"""
|
||||
1. Probe default sitemap locations.
|
||||
2. If none exist, parse robots.txt for alternative sitemap URLs.
|
||||
3. Yield only URLs that match `pattern`.
|
||||
"""
|
||||
|
||||
# ── cache file (same logic as _from_cc)
|
||||
host = re.sub(r'^https?://', '', domain).rstrip('/')
|
||||
host = re.sub('[/?#]+', '_', domain)
|
||||
digest = hashlib.md5(pattern.encode()).hexdigest()[:8]
|
||||
path = self.cache_dir / f"sitemap_{host}_{digest}.jsonl"
|
||||
|
||||
if path.exists() and not force:
|
||||
self._log("info", "Loading sitemap URLs for {d} from cache: {p}",
|
||||
params={"d": host, "p": str(path)}, tag="URL_SEED")
|
||||
async with aiofiles.open(path, "r") as fp:
|
||||
async for line in fp:
|
||||
url = line.strip()
|
||||
if _match(url, pattern):
|
||||
yield url
|
||||
return
|
||||
|
||||
# 1️⃣ direct sitemap probe
|
||||
# strip any scheme so we can handle https → http fallback
|
||||
host=re.sub(r'^https?://','',domain).rstrip('/')
|
||||
|
||||
schemes=('https','http') # prefer TLS, downgrade if needed
|
||||
for scheme in schemes:
|
||||
for suffix in ("/sitemap.xml","/sitemap_index.xml"):
|
||||
sm=f"{scheme}://{host}{suffix}"
|
||||
sm = await self._resolve_head(sm)
|
||||
if sm:
|
||||
self._log("info","Found sitemap at {url}",params={"url":sm},tag="URL_SEED")
|
||||
async with aiofiles.open(path, "w") as fp:
|
||||
async for u in self._iter_sitemap(sm):
|
||||
await fp.write(u + "\n")
|
||||
if _match(u, pattern):
|
||||
yield u
|
||||
return
|
||||
|
||||
# 2️⃣ robots.txt fallback
|
||||
robots=f"https://{domain.rstrip('/')}/robots.txt"
|
||||
try:
|
||||
r=await self.client.get(robots,timeout=10,follow_redirects=True)
|
||||
if not 200<=r.status_code<300:
|
||||
self._log("warning","robots.txt unavailable for {d} HTTP{c}",params={"d":domain,"c":r.status_code},tag="URL_SEED")
|
||||
return
|
||||
sitemap_lines=[l.split(":",1)[1].strip() for l in r.text.splitlines() if l.lower().startswith("sitemap:")]
|
||||
except Exception as e:
|
||||
self._log("warning","Failed to fetch robots.txt for {d}: {e}",params={"d":domain,"e":str(e)},tag="URL_SEED")
|
||||
return
|
||||
|
||||
if sitemap_lines:
|
||||
async with aiofiles.open(path, "w") as fp:
|
||||
for sm in sitemap_lines:
|
||||
async for u in self._iter_sitemap(sm):
|
||||
await fp.write(u + "\n")
|
||||
if _match(u, pattern):
|
||||
yield u
|
||||
|
||||
async def _iter_sitemap(self, url:str):
|
||||
try:
|
||||
r = await self.client.get(url, timeout=15)
|
||||
r.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
||||
params={"url": url, "status_code": e.response.status_code}, tag="URL_SEED")
|
||||
return
|
||||
except httpx.RequestError as e:
|
||||
self._log("warning", "Network error fetching sitemap {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return
|
||||
except Exception as e:
|
||||
self._log("error", "Unexpected error fetching sitemap {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return
|
||||
|
||||
data = gzip.decompress(r.content) if url.endswith(".gz") else r.content
|
||||
|
||||
# Use lxml for XML parsing if available, as it's generally more robust
|
||||
if LXML:
|
||||
try:
|
||||
# Use XML parser for sitemaps, not HTML parser
|
||||
parser = etree.XMLParser(recover=True)
|
||||
root = etree.fromstring(data, parser=parser)
|
||||
|
||||
# Define namespace for sitemap
|
||||
ns = {'s': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
|
||||
|
||||
# First check if this is a sitemap index
|
||||
for sitemap_elem in root.xpath('//s:sitemap/s:loc', namespaces=ns):
|
||||
loc = sitemap_elem.text.strip() if sitemap_elem.text else ""
|
||||
if loc:
|
||||
self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
|
||||
async for u in self._iter_sitemap(loc):
|
||||
yield u
|
||||
|
||||
# Then check for regular URLs
|
||||
for loc_elem in root.xpath('//s:url/s:loc', namespaces=ns):
|
||||
loc = loc_elem.text.strip() if loc_elem.text else ""
|
||||
if loc:
|
||||
yield loc
|
||||
except Exception as e:
|
||||
self._log("error", "LXML parsing error for sitemap {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return
|
||||
else: # Fallback to xml.etree.ElementTree
|
||||
import xml.etree.ElementTree as ET
|
||||
try:
|
||||
# Parse the XML
|
||||
root = ET.fromstring(data)
|
||||
# Remove namespace from tags for easier processing
|
||||
for elem in root.iter():
|
||||
if '}' in elem.tag:
|
||||
elem.tag = elem.tag.split('}')[1]
|
||||
|
||||
# Check for sitemap index entries
|
||||
for sitemap in root.findall('.//sitemap'):
|
||||
loc_elem = sitemap.find('loc')
|
||||
if loc_elem is not None and loc_elem.text:
|
||||
loc = loc_elem.text.strip()
|
||||
self._log("debug", "Found nested sitemap: {loc}", params={"loc": loc}, tag="URL_SEED")
|
||||
async for u in self._iter_sitemap(loc):
|
||||
yield u
|
||||
|
||||
# Check for regular URL entries
|
||||
for url in root.findall('.//url'):
|
||||
loc_elem = url.find('loc')
|
||||
if loc_elem is not None and loc_elem.text:
|
||||
yield loc_elem.text.strip()
|
||||
except Exception as e:
|
||||
self._log("error", "ElementTree parsing error for sitemap {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return
|
||||
|
||||
|
||||
# ─────────────────────────────── validate helpers
|
||||
async def _validate(self, url:str, res_list:List[Dict[str,Any]], live:bool,
|
||||
extract:bool, timeout:int, verbose:bool):
|
||||
# Local verbose parameter for this function is used to decide if intermediate logs should be printed
|
||||
# The main logger's verbose status should be controlled by the caller.
|
||||
|
||||
cache_kind = "head" if extract else "live"
|
||||
|
||||
# ---------- try cache ----------
|
||||
if (live or extract) and not (hasattr(self, 'force') and self.force):
|
||||
cached = self._cache_get(cache_kind, url)
|
||||
if cached:
|
||||
res_list.append(cached)
|
||||
return
|
||||
|
||||
if extract:
|
||||
self._log("debug", "Fetching head for {url}", params={"url": url}, tag="URL_SEED")
|
||||
ok,html,final = await self._fetch_head(url,timeout)
|
||||
status="valid" if ok else "not_valid"
|
||||
self._log("info" if ok else "warning", "HEAD {status} for {final_url}",
|
||||
params={"status": status.upper(), "final_url": final or url}, tag="URL_SEED")
|
||||
entry = {
|
||||
"url": final or url,
|
||||
"status": status,
|
||||
"head_data": _parse_head(html) if ok else {},
|
||||
}
|
||||
if live or extract:
|
||||
self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
elif live:
|
||||
self._log("debug", "Performing live check for {url}", params={"url": url}, tag="URL_SEED")
|
||||
ok=await self._resolve_head(url)
|
||||
status="valid" if ok else "not_valid"
|
||||
self._log("info" if ok else "warning", "LIVE CHECK {status} for {url}",
|
||||
params={"status": status.upper(), "url": url}, tag="URL_SEED")
|
||||
entry = {"url": url, "status": status, "head_data": {}}
|
||||
if live or extract:
|
||||
self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
else:
|
||||
entry = {"url": url, "status": "unknown", "head_data": {}}
|
||||
if live or extract:
|
||||
self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
|
||||
|
||||
async def _head_ok(self, url:str, timeout:int)->bool:
|
||||
try:
|
||||
r=await self.client.head(url, timeout=timeout,
|
||||
headers={"Range":"bytes=0-0","Accept-Encoding":"identity"})
|
||||
r.raise_for_status() # Raise for bad status codes (4xx, 5xx)
|
||||
return True
|
||||
except httpx.RequestError as e:
|
||||
self._log("debug", "HEAD check network error for {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return False
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("debug", "HEAD check HTTP status error for {url}: {status_code}",
|
||||
params={"url": url, "status_code": e.response.status_code}, tag="URL_SEED")
|
||||
return False
|
||||
except Exception as e:
|
||||
self._log("error", "Unexpected error during HEAD check for {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return False
|
||||
|
||||
async def _fetch_head(
|
||||
self,
|
||||
url: str,
|
||||
timeout: int,
|
||||
max_redirects: int = 5,
|
||||
max_bytes: int = 65_536, # stop after 64 kB even if </head> never comes
|
||||
chunk_size: int = 4096, # how much we read per await
|
||||
):
|
||||
for _ in range(max_redirects+1):
|
||||
try:
|
||||
# ask the first `max_bytes` and force plain text to avoid
|
||||
# partial-gzip decode headaches
|
||||
async with self.client.stream(
|
||||
"GET",
|
||||
url,
|
||||
timeout=timeout,
|
||||
headers={
|
||||
# "Range": f"bytes=0-{max_bytes-1}", # Dropped the Range header – no need now, and some servers ignore it. We still keep an upper‐bound max_bytes as a fail-safe.
|
||||
"Accept-Encoding": "identity",
|
||||
},
|
||||
follow_redirects=False,
|
||||
) as r:
|
||||
|
||||
if r.status_code in (301,302,303,307,308):
|
||||
location = r.headers.get("Location")
|
||||
if location:
|
||||
url = urljoin(url, location)
|
||||
self._log("debug", "Redirecting from {original_url} to {new_url}",
|
||||
params={"original_url": r.url, "new_url": url}, tag="URL_SEED")
|
||||
continue
|
||||
else:
|
||||
self._log("warning", "Redirect status {status_code} but no Location header for {url}",
|
||||
params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
|
||||
return False, "", str(r.url) # Return original URL if no new location
|
||||
|
||||
# For 2xx or other non-redirect codes, proceed to read content
|
||||
if not (200 <= r.status_code < 400): # Only allow successful codes, or continue
|
||||
self._log("warning", "Non-success status {status_code} when fetching head for {url}",
|
||||
params={"status_code": r.status_code, "url": r.url}, tag="URL_SEED")
|
||||
return False, "", str(r.url)
|
||||
|
||||
buf = bytearray()
|
||||
async for chunk in r.aiter_bytes(chunk_size):
|
||||
buf.extend(chunk)
|
||||
low = buf.lower()
|
||||
if b"</head>" in low or len(buf) >= max_bytes:
|
||||
await r.aclose()
|
||||
break
|
||||
|
||||
enc = r.headers.get("Content-Encoding", "").lower()
|
||||
try:
|
||||
if enc == "gzip" and buf[:2] == b"\x1f\x8b":
|
||||
buf = gzip.decompress(buf)
|
||||
elif enc == "br" and HAS_BROTLI and buf[:4] == b"\x8b\x6c\x0a\x1a":
|
||||
buf = brotli.decompress(buf)
|
||||
elif enc in {"gzip", "br"}:
|
||||
# Header says “gzip” or “br” but payload is plain – ignore
|
||||
self._log(
|
||||
"debug",
|
||||
"Skipping bogus {encoding} for {url}",
|
||||
params={"encoding": enc, "url": r.url},
|
||||
tag="URL_SEED",
|
||||
)
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"warning",
|
||||
"Decompression error for {url} ({encoding}): {error}",
|
||||
params={"url": r.url, "encoding": enc, "error": str(e)},
|
||||
tag="URL_SEED",
|
||||
)
|
||||
# fall through with raw buf
|
||||
|
||||
# Find the </head> tag case-insensitively and decode
|
||||
idx = buf.lower().find(b"</head>")
|
||||
if idx==-1:
|
||||
self._log("debug", "No </head> tag found in initial bytes of {url}",
|
||||
params={"url": r.url}, tag="URL_SEED")
|
||||
# If no </head> is found, take a reasonable chunk or all if small
|
||||
html_bytes = buf if len(buf) < 10240 else buf[:10240] # Take max 10KB if no head tag
|
||||
else:
|
||||
html_bytes = buf[:idx+7] # Include </head> tag
|
||||
|
||||
try:
|
||||
html = html_bytes.decode("utf-8", "replace")
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"warning",
|
||||
"Failed to decode head content for {url}: {error}",
|
||||
params={"url": r.url, "error": str(e)},
|
||||
tag="URL_SEED",
|
||||
)
|
||||
html = html_bytes.decode("latin-1", "replace")
|
||||
|
||||
return True,html,str(r.url) # Return the actual URL after redirects
|
||||
|
||||
except httpx.RequestError as e:
|
||||
self._log("debug", "Fetch head network error for {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
return False,"",url
|
||||
|
||||
# If loop finishes without returning (e.g. too many redirects)
|
||||
self._log("warning", "Exceeded max redirects ({max_redirects}) for {url}",
|
||||
params={"max_redirects": max_redirects, "url": url}, tag="URL_SEED")
|
||||
return False,"",url
|
||||
|
||||
# ─────────────────────────────── BM25 scoring helpers
|
||||
def _extract_text_context(self, head_data: Dict[str, Any]) -> str:
|
||||
"""Extract all relevant text from head metadata for scoring."""
|
||||
# Priority fields with their weights (for future enhancement)
|
||||
text_parts = []
|
||||
|
||||
# Title
|
||||
if head_data.get("title"):
|
||||
text_parts.append(head_data["title"])
|
||||
|
||||
# Standard meta tags
|
||||
meta = head_data.get("meta", {})
|
||||
for key in ["description", "keywords", "author", "subject", "summary", "abstract"]:
|
||||
if meta.get(key):
|
||||
text_parts.append(meta[key])
|
||||
|
||||
# Open Graph tags
|
||||
for key in ["og:title", "og:description", "og:site_name", "article:tag"]:
|
||||
if meta.get(key):
|
||||
text_parts.append(meta[key])
|
||||
|
||||
# Twitter Card tags
|
||||
for key in ["twitter:title", "twitter:description", "twitter:image:alt"]:
|
||||
if meta.get(key):
|
||||
text_parts.append(meta[key])
|
||||
|
||||
# Dublin Core tags
|
||||
for key in ["dc.title", "dc.description", "dc.subject", "dc.creator"]:
|
||||
if meta.get(key):
|
||||
text_parts.append(meta[key])
|
||||
|
||||
# JSON-LD structured data
|
||||
for jsonld in head_data.get("jsonld", []):
|
||||
if isinstance(jsonld, dict):
|
||||
# Extract common fields from JSON-LD
|
||||
for field in ["name", "headline", "description", "abstract", "keywords"]:
|
||||
if field in jsonld:
|
||||
if isinstance(jsonld[field], str):
|
||||
text_parts.append(jsonld[field])
|
||||
elif isinstance(jsonld[field], list):
|
||||
text_parts.extend(str(item) for item in jsonld[field] if item)
|
||||
|
||||
# Handle @graph structures
|
||||
if "@graph" in jsonld and isinstance(jsonld["@graph"], list):
|
||||
for item in jsonld["@graph"]:
|
||||
if isinstance(item, dict):
|
||||
for field in ["name", "headline", "description"]:
|
||||
if field in item and isinstance(item[field], str):
|
||||
text_parts.append(item[field])
|
||||
|
||||
# Combine all text parts
|
||||
return " ".join(filter(None, text_parts))
|
||||
|
||||
def _calculate_bm25_score(self, query: str, documents: List[str]) -> List[float]:
|
||||
"""Calculate BM25 scores for documents against a query."""
|
||||
if not HAS_BM25:
|
||||
self._log("warning", "rank_bm25 not installed. Returning zero scores.", tag="URL_SEED")
|
||||
return [0.0] * len(documents)
|
||||
|
||||
if not query or not documents:
|
||||
return [0.0] * len(documents)
|
||||
|
||||
# Tokenize query and documents (simple whitespace tokenization)
|
||||
# For production, consider using a proper tokenizer
|
||||
query_tokens = query.lower().split()
|
||||
tokenized_docs = [doc.lower().split() for doc in documents]
|
||||
|
||||
# Handle edge case where all documents are empty
|
||||
if all(len(doc) == 0 for doc in tokenized_docs):
|
||||
return [0.0] * len(documents)
|
||||
|
||||
# Create BM25 instance and calculate scores
|
||||
try:
|
||||
from rank_bm25 import BM25Okapi
|
||||
bm25 = BM25Okapi(tokenized_docs)
|
||||
scores = bm25.get_scores(query_tokens)
|
||||
|
||||
# Normalize scores to 0-1 range
|
||||
max_score = max(scores) if max(scores) > 0 else 1.0
|
||||
normalized_scores = [score / max_score for score in scores]
|
||||
|
||||
return normalized_scores
|
||||
except Exception as e:
|
||||
self._log("error", "Error calculating BM25 scores: {error}",
|
||||
params={"error": str(e)}, tag="URL_SEED")
|
||||
return [0.0] * len(documents)
|
||||
|
||||
# ─────────────────────────────── index helper
|
||||
async def _latest_index(self)->str:
|
||||
if self.index_cache_path.exists() and (time.time()-self.index_cache_path.stat().st_mtime)<self.ttl.total_seconds():
|
||||
self._log("info", "Loading latest CC index from cache: {path}",
|
||||
params={"path": self.index_cache_path}, tag="URL_SEED")
|
||||
return self.index_cache_path.read_text().strip()
|
||||
|
||||
self._log("info", "Fetching latest Common Crawl index from {url}",
|
||||
params={"url": COLLINFO_URL}, tag="URL_SEED")
|
||||
try:
|
||||
async with httpx.AsyncClient() as c:
|
||||
j=await c.get(COLLINFO_URL,timeout=10)
|
||||
j.raise_for_status() # Raise an exception for bad status codes
|
||||
idx=j.json()[0]["id"]
|
||||
self.index_cache_path.write_text(idx)
|
||||
self._log("success", "Successfully fetched and cached CC index: {index_id}",
|
||||
params={"index_id": idx}, tag="URL_SEED")
|
||||
return idx
|
||||
except httpx.RequestError as e:
|
||||
self._log("error", "Network error fetching CC index info: {error}",
|
||||
params={"error": str(e)}, tag="URL_SEED")
|
||||
raise
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("error", "HTTP error fetching CC index info: {status_code}",
|
||||
params={"status_code": e.response.status_code}, tag="URL_SEED")
|
||||
raise
|
||||
except Exception as e:
|
||||
self._log("error", "Unexpected error fetching CC index info: {error}",
|
||||
params={"error": str(e)}, tag="URL_SEED")
|
||||
raise
|
||||
@@ -35,9 +35,10 @@ from .markdown_generation_strategy import (
|
||||
)
|
||||
from .deep_crawling import DeepCrawlDecorator
|
||||
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig, SeedingConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
@@ -163,6 +164,8 @@ class AsyncWebCrawler:
|
||||
# Decorate arun method with deep crawling capabilities
|
||||
self._deep_handler = DeepCrawlDecorator(self)
|
||||
self.arun = self._deep_handler(self.arun)
|
||||
|
||||
self.url_seeder: Optional[AsyncUrlSeeder] = None
|
||||
|
||||
async def start(self):
|
||||
"""
|
||||
@@ -744,3 +747,94 @@ class AsyncWebCrawler:
|
||||
else:
|
||||
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
||||
return [transform_result(res) for res in _results]
|
||||
|
||||
async def aseed_urls(
|
||||
self,
|
||||
domain_or_domains: Union[str, List[str]],
|
||||
config: Optional[SeedingConfig] = None,
|
||||
**kwargs
|
||||
) -> Union[List[str], Dict[str, List[Union[str, Dict[str, Any]]]]]:
|
||||
"""
|
||||
Discovers, filters, and optionally validates URLs for a given domain(s)
|
||||
using sitemaps and Common Crawl archives.
|
||||
|
||||
Args:
|
||||
domain_or_domains: A single domain string (e.g., "iana.org") or a list of domains.
|
||||
config: A SeedingConfig object to control the seeding process.
|
||||
Parameters passed directly via kwargs will override those in 'config'.
|
||||
**kwargs: Additional parameters (e.g., `source`, `live_check`, `extract_head`,
|
||||
`pattern`, `concurrency`, `hits_per_sec`, `force_refresh`, `verbose`)
|
||||
that will be used to construct or update the SeedingConfig.
|
||||
|
||||
Returns:
|
||||
If `extract_head` is False:
|
||||
- For a single domain: `List[str]` of discovered URLs.
|
||||
- For multiple domains: `Dict[str, List[str]]` mapping each domain to its URLs.
|
||||
If `extract_head` is True:
|
||||
- For a single domain: `List[Dict[str, Any]]` where each dict contains 'url'
|
||||
and 'head_data' (parsed <head> metadata).
|
||||
- For multiple domains: `Dict[str, List[Dict[str, Any]]]` mapping each domain
|
||||
to a list of URL data dictionaries.
|
||||
|
||||
Raises:
|
||||
ValueError: If `domain_or_domains` is not a string or a list of strings.
|
||||
Exception: Any underlying exceptions from AsyncUrlSeeder or network operations.
|
||||
|
||||
Example:
|
||||
>>> # Discover URLs from sitemap with live check for 'example.com'
|
||||
>>> result = await crawler.aseed_urls("example.com", source="sitemap", live_check=True, hits_per_sec=10)
|
||||
|
||||
>>> # Discover URLs from Common Crawl, extract head data for 'example.com' and 'python.org'
|
||||
>>> multi_domain_result = await crawler.aseed_urls(
|
||||
>>> ["example.com", "python.org"],
|
||||
>>> source="cc", extract_head=True, concurrency=200, hits_per_sec=50
|
||||
>>> )
|
||||
"""
|
||||
# Initialize AsyncUrlSeeder here if it hasn't been already
|
||||
if not self.url_seeder:
|
||||
# Pass the crawler's base_directory for seeder's cache management
|
||||
# Pass the crawler's logger for consistent logging
|
||||
self.url_seeder = AsyncUrlSeeder(
|
||||
base_directory=self.crawl4ai_folder,
|
||||
logger=self.logger
|
||||
)
|
||||
|
||||
# Merge config object with direct kwargs, giving kwargs precedence
|
||||
seeding_config = config.clone(**kwargs) if config else SeedingConfig.from_kwargs(kwargs)
|
||||
|
||||
# Ensure base_directory is set for the seeder's cache
|
||||
seeding_config.base_directory = seeding_config.base_directory or self.crawl4ai_folder
|
||||
# Ensure the seeder uses the crawler's logger (if not already set)
|
||||
if not self.url_seeder.logger:
|
||||
self.url_seeder.logger = self.logger
|
||||
|
||||
# Pass verbose setting if explicitly provided in SeedingConfig or kwargs
|
||||
if seeding_config.verbose is not None:
|
||||
self.url_seeder.logger.verbose = seeding_config.verbose
|
||||
else: # Default to crawler's verbose setting
|
||||
self.url_seeder.logger.verbose = self.logger.verbose
|
||||
|
||||
|
||||
if isinstance(domain_or_domains, str):
|
||||
self.logger.info(
|
||||
message="Starting URL seeding for domain: {domain}",
|
||||
tag="SEED",
|
||||
params={"domain": domain_or_domains}
|
||||
)
|
||||
return await self.url_seeder.urls(
|
||||
domain_or_domains,
|
||||
seeding_config
|
||||
)
|
||||
elif isinstance(domain_or_domains, (list, tuple)):
|
||||
self.logger.info(
|
||||
message="Starting URL seeding for {count} domains",
|
||||
tag="SEED",
|
||||
params={"count": len(domain_or_domains)}
|
||||
)
|
||||
# AsyncUrlSeeder.many_urls directly accepts a list of domains and individual params.
|
||||
return await self.url_seeder.many_urls(
|
||||
domain_or_domains,
|
||||
seeding_config
|
||||
)
|
||||
else:
|
||||
raise ValueError("`domain_or_domains` must be a string or a list of strings.")
|
||||
@@ -10,12 +10,16 @@ CacheMode = Union['CacheModeType']
|
||||
CrawlResult = Union['CrawlResultType']
|
||||
CrawlerHub = Union['CrawlerHubType']
|
||||
BrowserProfiler = Union['BrowserProfilerType']
|
||||
# NEW: Add AsyncUrlSeederType
|
||||
AsyncUrlSeeder = Union['AsyncUrlSeederType']
|
||||
|
||||
# Configuration types
|
||||
BrowserConfig = Union['BrowserConfigType']
|
||||
CrawlerRunConfig = Union['CrawlerRunConfigType']
|
||||
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
|
||||
LLMConfig = Union['LLMConfigType']
|
||||
# NEW: Add SeedingConfigType
|
||||
SeedingConfig = Union['SeedingConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
@@ -94,6 +98,8 @@ if TYPE_CHECKING:
|
||||
from .models import CrawlResult as CrawlResultType
|
||||
from .hub import CrawlerHub as CrawlerHubType
|
||||
from .browser_profiler import BrowserProfiler as BrowserProfilerType
|
||||
# NEW: Import AsyncUrlSeeder for type checking
|
||||
from .async_url_seeder import AsyncUrlSeeder as AsyncUrlSeederType
|
||||
|
||||
# Configuration imports
|
||||
from .async_configs import (
|
||||
@@ -101,6 +107,8 @@ if TYPE_CHECKING:
|
||||
CrawlerRunConfig as CrawlerRunConfigType,
|
||||
HTTPCrawlerConfig as HTTPCrawlerConfigType,
|
||||
LLMConfig as LLMConfigType,
|
||||
# NEW: Import SeedingConfig for type checking
|
||||
SeedingConfig as SeedingConfigType,
|
||||
)
|
||||
|
||||
# Content scraping imports
|
||||
@@ -184,4 +192,4 @@ if TYPE_CHECKING:
|
||||
|
||||
def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
|
||||
from .async_configs import LLMConfig
|
||||
return LLMConfig(*args, **kwargs)
|
||||
return LLMConfig(*args, **kwargs)
|
||||
261
docs/examples/url_seeder/url_seeder_demo.py
Normal file
261
docs/examples/url_seeder/url_seeder_demo.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
URL Seeder Demo - Interactive showcase of Crawl4AI's URL discovery capabilities
|
||||
|
||||
This demo shows:
|
||||
1. Basic URL discovery from sitemaps and Common Crawl
|
||||
2. Cache management and forced refresh
|
||||
3. Live URL validation and metadata extraction
|
||||
4. BM25 relevance scoring for intelligent filtering
|
||||
5. Integration with AsyncWebCrawler for the complete pipeline
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TimeElapsedColumn
|
||||
from rich.prompt import Prompt, Confirm
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig
|
||||
)
|
||||
|
||||
console = Console()
|
||||
|
||||
console.rule("[bold green]🌐 Crawl4AI URL Seeder: Interactive Demo")
|
||||
|
||||
DOMAIN = "crawl4ai.com"
|
||||
|
||||
# Utils
|
||||
|
||||
def print_head_info(head_data):
|
||||
table = Table(title="<head> Metadata", expand=True)
|
||||
table.add_column("Key", style="cyan", no_wrap=True)
|
||||
table.add_column("Value", style="magenta")
|
||||
|
||||
if not head_data:
|
||||
console.print("[yellow]No head data found.")
|
||||
return
|
||||
|
||||
if head_data.get("title"):
|
||||
table.add_row("title", head_data["title"])
|
||||
if head_data.get("charset"):
|
||||
table.add_row("charset", head_data["charset"])
|
||||
for k, v in head_data.get("meta", {}).items():
|
||||
table.add_row(f"meta:{k}", v)
|
||||
for rel, items in head_data.get("link", {}).items():
|
||||
for item in items:
|
||||
table.add_row(f"link:{rel}", item.get("href", ""))
|
||||
console.print(table)
|
||||
|
||||
|
||||
async def section_1_basic_exploration(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]1. Basic Seeding")
|
||||
cfg = SeedingConfig(source="cc+sitemap", pattern="*", verbose=True)
|
||||
|
||||
start_time = time.time()
|
||||
with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
|
||||
p.add_task(description="Fetching from Common Crawl + Sitemap...", total=None)
|
||||
urls = await seed.urls(DOMAIN, cfg)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
console.print(f"[green]✓ Fetched {len(urls)} URLs in {elapsed:.2f} seconds")
|
||||
console.print(f"[dim] Speed: {len(urls)/elapsed:.0f} URLs/second[/dim]\n")
|
||||
|
||||
console.print("[bold]Sample URLs:[/bold]")
|
||||
for u in urls[:5]:
|
||||
console.print(f" • {u['url']}")
|
||||
|
||||
|
||||
async def section_2_cache_demo(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]2. Caching Demonstration")
|
||||
console.print("[yellow]Using `force=True` to bypass cache and fetch fresh data.[/yellow]")
|
||||
cfg = SeedingConfig(source="cc", pattern="*crawl4ai.com/core/*", verbose=False, force = True)
|
||||
await seed.urls(DOMAIN, cfg)
|
||||
|
||||
async def section_3_live_head(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]3. Live Check + Head Extraction")
|
||||
cfg = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=10,
|
||||
hits_per_sec=5,
|
||||
pattern="*crawl4ai.com/*",
|
||||
max_urls=10,
|
||||
verbose=False,
|
||||
)
|
||||
urls = await seed.urls(DOMAIN, cfg)
|
||||
|
||||
valid = [u for u in urls if u["status"] == "valid"]
|
||||
console.print(f"[green]Valid: {len(valid)} / {len(urls)}")
|
||||
if valid:
|
||||
print_head_info(valid[0]["head_data"])
|
||||
|
||||
|
||||
async def section_4_bm25_scoring(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]4. BM25 Relevance Scoring")
|
||||
console.print("[yellow]Using AI-powered relevance scoring to find the most relevant content[/yellow]")
|
||||
|
||||
query = "markdown generation extraction strategies"
|
||||
cfg = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.3, # Only URLs with >30% relevance
|
||||
max_urls=20,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
|
||||
p.add_task(description=f"Searching for: '{query}'", total=None)
|
||||
urls = await seed.urls(DOMAIN, cfg)
|
||||
|
||||
console.print(f"[green]Found {len(urls)} relevant URLs (score > 0.3)")
|
||||
|
||||
# Show top results with scores
|
||||
table = Table(title="Top 5 Most Relevant Pages", expand=True)
|
||||
table.add_column("Score", style="cyan", width=8)
|
||||
table.add_column("Title", style="magenta")
|
||||
table.add_column("URL", style="blue", overflow="fold")
|
||||
|
||||
for url in urls[:5]:
|
||||
score = f"{url['relevance_score']:.2f}"
|
||||
title = url['head_data'].get('title', 'No title')[:60] + "..."
|
||||
table.add_row(score, title, url['url'])
|
||||
|
||||
console.print(table)
|
||||
|
||||
async def section_5_keyword_filter_to_agent(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]5. Complete Pipeline: Discover → Filter → Crawl")
|
||||
cfg = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=20,
|
||||
hits_per_sec=10,
|
||||
max_urls=10,
|
||||
pattern="*crawl4ai.com/*",
|
||||
force=True,
|
||||
)
|
||||
urls = await seed.urls(DOMAIN, cfg)
|
||||
|
||||
keywords = ["deep crawling", "markdown", "llm"]
|
||||
selected = [u for u in urls if any(k in str(u["head_data"]).lower() for k in keywords)]
|
||||
|
||||
console.print(f"[cyan]Selected {len(selected)} URLs with relevant keywords:")
|
||||
for u in selected[:10]:
|
||||
console.print("•", u["url"])
|
||||
|
||||
console.print("\n[yellow]Passing above URLs to arun_many() LLM agent for crawling...")
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
crawl_run_config = CrawlerRunConfig(
|
||||
# Example crawl settings for these URLs:
|
||||
only_text=True, # Just get text content
|
||||
screenshot=False,
|
||||
pdf=False,
|
||||
word_count_threshold=50, # Only process pages with at least 50 words
|
||||
stream=True,
|
||||
verbose=False # Keep logs clean for arun_many in this demo
|
||||
)
|
||||
|
||||
# Extract just the URLs from the selected results
|
||||
urls_to_crawl = [u["url"] for u in selected]
|
||||
|
||||
# We'll stream results for large lists, but collect them here for demonstration
|
||||
crawled_results_stream = await crawler.arun_many(urls_to_crawl, config=crawl_run_config)
|
||||
final_crawled_data = []
|
||||
async for result in crawled_results_stream:
|
||||
final_crawled_data.append(result)
|
||||
if len(final_crawled_data) % 5 == 0:
|
||||
print(f" Processed {len(final_crawled_data)}/{len(urls_to_crawl)} URLs...")
|
||||
|
||||
print(f"\n Successfully crawled {len(final_crawled_data)} URLs.")
|
||||
if final_crawled_data:
|
||||
print("\n Example of a crawled result's URL and Markdown (first successful one):")
|
||||
for result in final_crawled_data:
|
||||
if result.success and result.markdown.raw_markdown:
|
||||
print(f" URL: {result.url}")
|
||||
print(f" Markdown snippet: {result.markdown.raw_markdown[:200]}...")
|
||||
break
|
||||
else:
|
||||
print(" No successful crawls with markdown found.")
|
||||
else:
|
||||
print(" No successful crawls found.")
|
||||
|
||||
|
||||
async def section_6_multi_domain(seed: AsyncUrlSeeder):
|
||||
console.rule("[bold cyan]6. Multi-Domain Discovery")
|
||||
console.print("[yellow]Discovering Python tutorials across multiple educational sites[/yellow]\n")
|
||||
|
||||
domains = ["docs.python.org", "realpython.com", "docs.crawl4ai.com"]
|
||||
cfg = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="python tutorial guide",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=5 # Per domain
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
with Progress(SpinnerColumn(), "[progress.description]{task.description}") as p:
|
||||
task = p.add_task(description="Discovering across domains...", total=None)
|
||||
results = await seed.many_urls(domains, cfg)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
total_urls = sum(len(urls) for urls in results.values())
|
||||
console.print(f"[green]✓ Found {total_urls} relevant URLs across {len(domains)} domains in {elapsed:.2f}s\n")
|
||||
|
||||
# Show results per domain
|
||||
for domain, urls in results.items():
|
||||
console.print(f"[bold]{domain}:[/bold] {len(urls)} relevant pages")
|
||||
if urls:
|
||||
top = urls[0]
|
||||
console.print(f" Top result: [{top['relevance_score']:.2f}] {top['head_data'].get('title', 'No title')}")
|
||||
|
||||
|
||||
async def main():
|
||||
seed = AsyncUrlSeeder()
|
||||
|
||||
# Interactive menu
|
||||
sections = {
|
||||
"1": ("Basic URL Discovery", section_1_basic_exploration),
|
||||
"2": ("Cache Management Demo", section_2_cache_demo),
|
||||
"3": ("Live Check & Metadata Extraction", section_3_live_head),
|
||||
"4": ("BM25 Relevance Scoring", section_4_bm25_scoring),
|
||||
"5": ("Complete Pipeline (Discover → Filter → Crawl)", section_5_keyword_filter_to_agent),
|
||||
"6": ("Multi-Domain Discovery", section_6_multi_domain),
|
||||
"7": ("Run All Demos", None)
|
||||
}
|
||||
|
||||
console.print("\n[bold]Available Demos:[/bold]")
|
||||
for key, (title, _) in sections.items():
|
||||
console.print(f" {key}. {title}")
|
||||
|
||||
choice = Prompt.ask("\n[cyan]Which demo would you like to run?[/cyan]",
|
||||
choices=list(sections.keys()),
|
||||
default="7")
|
||||
|
||||
console.print()
|
||||
|
||||
if choice == "7":
|
||||
# Run all demos
|
||||
for key, (title, func) in sections.items():
|
||||
if key != "7" and func:
|
||||
await func(seed)
|
||||
if key != "6": # Don't pause after the last demo
|
||||
if not Confirm.ask("\n[yellow]Continue to next demo?[/yellow]", default=True):
|
||||
break
|
||||
console.print()
|
||||
else:
|
||||
# Run selected demo
|
||||
_, func = sections[choice]
|
||||
await func(seed)
|
||||
|
||||
console.rule("[bold green]Demo Complete ✔︎")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
128
docs/examples/url_seeder/url_seeder_quick_demo.py
Normal file
128
docs/examples/url_seeder/url_seeder_quick_demo.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
🚀 URL Seeder + AsyncWebCrawler = Magic!
|
||||
Quick demo showing discovery → filter → crawl pipeline
|
||||
"""
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, AsyncWebCrawler, SeedingConfig, CrawlerRunConfig, AsyncLogger, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
# 🔍 Example 1: Discover ALL → Filter → Crawl
|
||||
async def discover_and_crawl():
|
||||
"""Find Python module tutorials & extract them all!"""
|
||||
seeder = AsyncUrlSeeder(
|
||||
logger=AsyncLogger() # Log everything
|
||||
)
|
||||
|
||||
# Step 1: See how many URLs exist (spoiler: A LOT!)
|
||||
print("📊 Let's see what RealPython has...")
|
||||
all_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(source="sitemap"))
|
||||
print(f"😱 Found {len(all_urls)} total URLs!")
|
||||
|
||||
# Step 2: Filter for Python modules (perfect size ~13)
|
||||
print("\n🎯 Filtering for 'python-modules' tutorials...")
|
||||
module_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*python-modules*",
|
||||
live_check=True # Make sure they're alive!
|
||||
))
|
||||
|
||||
print(f"✨ Found {len(module_urls)} module tutorials")
|
||||
for url in module_urls[:3]: # Show first 3
|
||||
status = "✅" if url["status"] == "valid" else "❌"
|
||||
print(f"{status} {url['url']}")
|
||||
|
||||
# Step 3: Crawl them all with pruning (keep it lean!)
|
||||
print("\n🕷️ Crawling all module tutorials...")
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter( # Smart filtering!
|
||||
threshold=0.48, # Remove fluff
|
||||
threshold_type="fixed",
|
||||
),
|
||||
),
|
||||
only_text=True,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
# Extract just the URLs from the seeder results
|
||||
urls_to_crawl = [u["url"] for u in module_urls[:5]]
|
||||
results = await crawler.arun_many(urls_to_crawl, config=config)
|
||||
|
||||
# Process & save
|
||||
saved = 0
|
||||
async for result in results:
|
||||
if result.success:
|
||||
# Save each tutorial (name from URL)
|
||||
name = result.url.split("/")[-2] + ".md"
|
||||
with open(name, "w") as f:
|
||||
f.write(result.markdown.fit_markdown)
|
||||
saved += 1
|
||||
print(f"💾 Saved: {name}")
|
||||
|
||||
print(f"\n🎉 Successfully saved {saved} tutorials!")
|
||||
|
||||
# 🔍 Example 2: Beautiful Soup articles with metadata peek
|
||||
async def explore_beautifulsoup():
|
||||
"""Discover BeautifulSoup content & peek at metadata"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🍲 Looking for Beautiful Soup articles...")
|
||||
soup_urls = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True # Get the metadata!
|
||||
))
|
||||
|
||||
print(f"\n📚 Found {len(soup_urls)} Beautiful Soup articles:\n")
|
||||
|
||||
# Show what we discovered
|
||||
for i, url in enumerate(soup_urls, 1):
|
||||
meta = url["head_data"]["meta"]
|
||||
|
||||
print(f"{i}. {url['head_data']['title']}")
|
||||
print(f" 📝 {meta.get('description', 'No description')[:60]}...")
|
||||
print(f" 👤 By: {meta.get('author', 'Unknown')}")
|
||||
print(f" 🔗 {url['url']}\n")
|
||||
|
||||
# 🔍 Example 3: Smart search with BM25 relevance scoring
|
||||
async def smart_search_with_bm25():
|
||||
"""Use AI-powered relevance scoring to find the best content"""
|
||||
seeder = AsyncUrlSeeder(logger=AsyncLogger() )
|
||||
|
||||
print("🧠 Smart search: 'web scraping tutorial quiz'")
|
||||
|
||||
# Search with BM25 scoring - AI finds the best matches!
|
||||
results = await seeder.urls("realpython.com",
|
||||
SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*beautiful-soup*",
|
||||
extract_head=True,
|
||||
query="web scraping tutorial quiz", # Our search
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2 # Quality filter
|
||||
))
|
||||
|
||||
print(f"\n🎯 Top {len(results)} most relevant results:\n")
|
||||
|
||||
# Show ranked results with relevance scores
|
||||
for i, result in enumerate(results[:3], 1):
|
||||
print(f"{i}. [{result['relevance_score']:.2f}] {result['head_data']['title']}")
|
||||
print(f" 🔗 {result['url'][:60]}...")
|
||||
|
||||
print("\n✨ BM25 automatically ranked by relevance!")
|
||||
|
||||
# 🎬 Run the show!
|
||||
async def main():
|
||||
# print("=" * 60)
|
||||
# await discover_and_crawl()
|
||||
# print("\n" + "=" * 60 + "\n")
|
||||
# await explore_beautifulsoup()
|
||||
# print("\n" + "=" * 60 + "\n")
|
||||
await smart_search_with_bm25()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -16,22 +16,10 @@
|
||||
--mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
|
||||
Courier New, monospace, serif;
|
||||
|
||||
--background-color: #151515; /* Dark background */
|
||||
--font-color: #eaeaea; /* Light font color for contrast */
|
||||
--invert-font-color: #151515; /* Dark color for inverted elements */
|
||||
--primary-color: #1a95e0; /* Primary color can remain the same or be adjusted for better contrast */
|
||||
--secondary-color: #727578; /* Secondary color for less important text */
|
||||
--secondary-dimmed-color: #8b857a; /* Dimmed secondary color */
|
||||
--error-color: #ff5555; /* Bright color for errors */
|
||||
--progress-bar-background: #444; /* Darker background for progress bar */
|
||||
--progress-bar-fill: #1a95e0; /* Bright color for progress bar fill */
|
||||
--code-bg-color: #1e1e1e; /* Darker background for code blocks */
|
||||
--input-style: solid; /* Keeping input style solid */
|
||||
--block-background-color: #202020; /* Darker background for block elements */
|
||||
--global-font-color: #eaeaea; /* Light font color for global elements */
|
||||
|
||||
--background-color: #222225;
|
||||
|
||||
--background-color: #070708;
|
||||
--page-width: 70em;
|
||||
--font-color: #e8e9ed;
|
||||
@@ -40,7 +28,7 @@
|
||||
--secondary-color: #d5cec0;
|
||||
--tertiary-color: #a3abba;
|
||||
--primary-dimmed-color: #09b5a5; /* Updated to the brand color */
|
||||
--primary-color: #50ffff; /* Updated to the brand color */
|
||||
--primary-color: #0fbbaa; /* Updated to the brand color */
|
||||
--accent-color: rgb(243, 128, 245);
|
||||
--error-color: #ff3c74;
|
||||
--progress-bar-background: #3f3f44;
|
||||
|
||||
1000
docs/md_v2/core/url-seeding.md
Normal file
1000
docs/md_v2/core/url-seeding.md
Normal file
File diff suppressed because it is too large
Load Diff
711
tests/general/test_async_url_seeder_bm25.py
Normal file
711
tests/general/test_async_url_seeder_bm25.py
Normal file
@@ -0,0 +1,711 @@
|
||||
"""
|
||||
Comprehensive test cases for AsyncUrlSeeder with BM25 scoring functionality.
|
||||
Tests cover all features including query-based scoring, metadata extraction,
|
||||
edge cases, and integration scenarios.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
from typing import List, Dict, Any
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig, AsyncLogger
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
# Test domain - using docs.crawl4ai.com as it has the actual documentation
|
||||
TEST_DOMAIN = "kidocode.com"
|
||||
TEST_DOMAIN = "docs.crawl4ai.com"
|
||||
TEST_DOMAIN = "www.bbc.com/sport"
|
||||
|
||||
|
||||
class TestAsyncUrlSeederBM25:
|
||||
"""Comprehensive test suite for AsyncUrlSeeder with BM25 scoring."""
|
||||
|
||||
async def create_seeder(self):
|
||||
"""Create an AsyncUrlSeeder instance for testing."""
|
||||
logger = AsyncLogger()
|
||||
return AsyncUrlSeeder(logger=logger)
|
||||
|
||||
# ============================================
|
||||
# Basic BM25 Scoring Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_basic_bm25_scoring(self, seeder):
|
||||
"""Test basic BM25 scoring with a simple query."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="premier league highlights",
|
||||
scoring_method="bm25",
|
||||
max_urls=200,
|
||||
verbose=True,
|
||||
force=True # Force fresh fetch
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Verify results have relevance scores
|
||||
assert all("relevance_score" in r for r in results)
|
||||
|
||||
# Verify scores are normalized between 0 and 1
|
||||
scores = [r["relevance_score"] for r in results]
|
||||
assert all(0.0 <= s <= 1.0 for s in scores)
|
||||
|
||||
# Verify results are sorted by relevance (descending)
|
||||
assert scores == sorted(scores, reverse=True)
|
||||
|
||||
# Print top 5 results for manual verification
|
||||
print("\nTop 5 results for 'web crawling tutorial':")
|
||||
for i, r in enumerate(results[:5]):
|
||||
print(f"{i+1}. Score: {r['relevance_score']:.3f} - {r['url']}")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_variations(self, seeder):
|
||||
"""Test BM25 scoring with different query variations."""
|
||||
queries = [
|
||||
"VAR controversy",
|
||||
"player ratings",
|
||||
"live score update",
|
||||
"transfer rumours",
|
||||
"post match analysis",
|
||||
"injury news"
|
||||
]
|
||||
|
||||
for query in queries:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
max_urls=100,
|
||||
# force=True
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Verify each query produces scored results
|
||||
assert len(results) > 0
|
||||
assert all("relevance_score" in r for r in results)
|
||||
|
||||
print(f"\nTop result for '{query}':")
|
||||
if results:
|
||||
top = results[0]
|
||||
print(f" Score: {top['relevance_score']:.3f} - {top['url']}")
|
||||
|
||||
# ============================================
|
||||
# Score Threshold Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_score_threshold_filtering(self, seeder):
|
||||
"""Test filtering results by minimum relevance score."""
|
||||
thresholds = [0.1, 0.3, 0.5, 0.7]
|
||||
|
||||
for threshold in thresholds:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="league standings",
|
||||
score_threshold=threshold,
|
||||
scoring_method="bm25",
|
||||
max_urls=50
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Verify all results meet threshold
|
||||
if results:
|
||||
assert all(r["relevance_score"] >= threshold for r in results)
|
||||
|
||||
print(f"\nThreshold {threshold}: {len(results)} URLs passed")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extreme_thresholds(self, seeder):
|
||||
"""Test edge cases with extreme threshold values."""
|
||||
# Very low threshold - should return many results
|
||||
config_low = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="match",
|
||||
score_threshold=0.001,
|
||||
scoring_method="bm25"
|
||||
)
|
||||
results_low = await seeder.urls(TEST_DOMAIN, config_low)
|
||||
|
||||
# Very high threshold - might return few or no results
|
||||
config_high = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="match",
|
||||
score_threshold=0.99,
|
||||
scoring_method="bm25"
|
||||
)
|
||||
results_high = await seeder.urls(TEST_DOMAIN, config_high)
|
||||
|
||||
# Low threshold should return more results than high
|
||||
assert len(results_low) >= len(results_high)
|
||||
print(f"\nLow threshold (0.001): {len(results_low)} results")
|
||||
print(f"High threshold (0.99): {len(results_high)} results")
|
||||
|
||||
# ============================================
|
||||
# Metadata Extraction Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_comprehensive_metadata_extraction(self, seeder):
|
||||
"""Test extraction of all metadata types including JSON-LD."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="match report",
|
||||
scoring_method="bm25",
|
||||
max_urls=5,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
for result in results:
|
||||
head_data = result.get("head_data", {})
|
||||
|
||||
# Check for various metadata fields
|
||||
print(f"\nMetadata for {result['url']}:")
|
||||
print(f" Title: {head_data.get('title', 'N/A')}")
|
||||
print(f" Charset: {head_data.get('charset', 'N/A')}")
|
||||
print(f" Lang: {head_data.get('lang', 'N/A')}")
|
||||
|
||||
# Check meta tags
|
||||
meta = head_data.get("meta", {})
|
||||
if meta:
|
||||
print(" Meta tags found:")
|
||||
for key in ["description", "keywords", "author", "viewport"]:
|
||||
if key in meta:
|
||||
print(f" {key}: {meta[key][:50]}...")
|
||||
|
||||
# Check for Open Graph tags
|
||||
og_tags = {k: v for k, v in meta.items() if k.startswith("og:")}
|
||||
if og_tags:
|
||||
print(" Open Graph tags found:")
|
||||
for k, v in list(og_tags.items())[:3]:
|
||||
print(f" {k}: {v[:50]}...")
|
||||
|
||||
# Check JSON-LD
|
||||
if head_data.get("jsonld"):
|
||||
print(f" JSON-LD schemas found: {len(head_data['jsonld'])}")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_jsonld_extraction_scoring(self, seeder):
|
||||
"""Test that JSON-LD data contributes to BM25 scoring."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="Premier League match report highlights",
|
||||
scoring_method="bm25",
|
||||
max_urls=20
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Find results with JSON-LD data
|
||||
jsonld_results = [r for r in results if r.get("head_data", {}).get("jsonld")]
|
||||
|
||||
if jsonld_results:
|
||||
print(f"\nFound {len(jsonld_results)} URLs with JSON-LD data")
|
||||
for r in jsonld_results[:3]:
|
||||
print(f" Score: {r['relevance_score']:.3f} - {r['url']}")
|
||||
jsonld_data = r["head_data"]["jsonld"]
|
||||
print(f" JSON-LD types: {[item.get('@type', 'Unknown') for item in jsonld_data if isinstance(item, dict)]}")
|
||||
|
||||
# ============================================
|
||||
# Edge Cases and Error Handling
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_empty_query(self, seeder):
|
||||
"""Test behavior with empty query string."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="",
|
||||
scoring_method="bm25",
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Should return results but all with zero scores
|
||||
assert len(results) > 0
|
||||
assert all(r.get("relevance_score", 0) == 0 for r in results)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_query_without_extract_head(self, seeder):
|
||||
"""Test query scoring when extract_head is False."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=False, # This should trigger a warning
|
||||
query="Premier League match report highlights",
|
||||
scoring_method="bm25",
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Results should not have relevance scores
|
||||
assert all("relevance_score" not in r for r in results)
|
||||
print("\nVerified: No scores added when extract_head=False")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_special_characters_in_query(self, seeder):
|
||||
"""Test queries with special characters and symbols."""
|
||||
special_queries = [
|
||||
"premier league + analytics",
|
||||
"injury/rehab routines",
|
||||
"AI-powered scouting",
|
||||
"match stats & xG",
|
||||
"tactical@breakdown",
|
||||
"transfer-window.yml"
|
||||
]
|
||||
|
||||
for query in special_queries:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
max_urls=5
|
||||
)
|
||||
|
||||
try:
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
assert isinstance(results, list)
|
||||
print(f"\n✓ Query '{query}' processed successfully")
|
||||
except Exception as e:
|
||||
pytest.fail(f"Failed on query '{query}': {str(e)}")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_unicode_query(self, seeder):
|
||||
"""Test queries with Unicode characters."""
|
||||
unicode_queries = [
|
||||
"网页爬虫", # Chinese
|
||||
"веб-краулер", # Russian
|
||||
"🚀 crawl4ai", # Emoji
|
||||
"naïve implementation", # Accented characters
|
||||
]
|
||||
|
||||
for query in unicode_queries:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
max_urls=5
|
||||
)
|
||||
|
||||
try:
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
assert isinstance(results, list)
|
||||
print(f"\n✓ Unicode query '{query}' processed successfully")
|
||||
except Exception as e:
|
||||
print(f"\n✗ Unicode query '{query}' failed: {str(e)}")
|
||||
|
||||
# ============================================
|
||||
# Performance and Scalability Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_large_scale_scoring(self, seeder):
|
||||
"""Test BM25 scoring with many URLs."""
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Use both sources for more URLs
|
||||
extract_head=True,
|
||||
query="world cup group standings",
|
||||
scoring_method="bm25",
|
||||
max_urls=100,
|
||||
concurrency=20,
|
||||
hits_per_sec=10
|
||||
)
|
||||
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
elapsed = asyncio.get_event_loop().time() - start_time
|
||||
|
||||
print(f"\nProcessed {len(results)} URLs in {elapsed:.2f} seconds")
|
||||
print(f"Average time per URL: {elapsed/len(results)*1000:.1f}ms")
|
||||
|
||||
# Verify scoring worked at scale
|
||||
assert all("relevance_score" in r for r in results)
|
||||
|
||||
# Check score distribution
|
||||
scores = [r["relevance_score"] for r in results]
|
||||
print(f"Score distribution:")
|
||||
print(f" Min: {min(scores):.3f}")
|
||||
print(f" Max: {max(scores):.3f}")
|
||||
print(f" Avg: {sum(scores)/len(scores):.3f}")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_concurrent_scoring_consistency(self, seeder):
|
||||
"""Test that concurrent requests produce consistent scores."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="live score update",
|
||||
scoring_method="bm25",
|
||||
max_urls=20,
|
||||
concurrency=10
|
||||
)
|
||||
|
||||
# Run the same query multiple times
|
||||
results_list = []
|
||||
for _ in range(3):
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
results_list.append(results)
|
||||
|
||||
# Compare scores across runs (they should be identical for same URLs)
|
||||
url_scores = {}
|
||||
for results in results_list:
|
||||
for r in results:
|
||||
url = r["url"]
|
||||
score = r["relevance_score"]
|
||||
if url in url_scores:
|
||||
# Scores should be very close (allowing for tiny float differences)
|
||||
assert abs(url_scores[url] - score) < 0.001
|
||||
else:
|
||||
url_scores[url] = score
|
||||
|
||||
print(f"\n✓ Consistent scores across {len(results_list)} runs")
|
||||
|
||||
# ============================================
|
||||
# Multi-Domain Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_many_urls_with_scoring(self, seeder):
|
||||
"""Test many_urls method with BM25 scoring."""
|
||||
domains = [TEST_DOMAIN, "docs.crawl4ai.com", "example.com"]
|
||||
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
# live_check=True,
|
||||
query="fixture list",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10,
|
||||
force=True, # Force fresh fetch
|
||||
)
|
||||
|
||||
results_dict = await seeder.many_urls(domains, config)
|
||||
|
||||
for domain, results in results_dict.items():
|
||||
print(f"\nDomain: {domain}")
|
||||
print(f" Found {len(results)} URLs above threshold")
|
||||
if results:
|
||||
top = results[0]
|
||||
print(f" Top result: {top['relevance_score']:.3f} - {top['url']}")
|
||||
|
||||
# ============================================
|
||||
# Complex Query Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_multi_word_complex_queries(self, seeder):
|
||||
"""Test complex multi-word queries."""
|
||||
complex_queries = [
|
||||
"how to follow live match commentary",
|
||||
"extract expected goals stats from match data",
|
||||
"premier league match report analysis",
|
||||
"transfer rumours and confirmed signings tracker",
|
||||
"tactical breakdown of high press strategy"
|
||||
]
|
||||
|
||||
for query in complex_queries:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
max_urls=5
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
if results:
|
||||
print(f"\nQuery: '{query}'")
|
||||
print(f"Top match: {results[0]['relevance_score']:.3f} - {results[0]['url']}")
|
||||
|
||||
# Extract matched terms from metadata
|
||||
head_data = results[0].get("head_data", {})
|
||||
title = head_data.get("title", "")
|
||||
description = head_data.get("meta", {}).get("description", "")
|
||||
|
||||
# Simple term matching for verification
|
||||
query_terms = set(query.lower().split())
|
||||
title_terms = set(title.lower().split())
|
||||
desc_terms = set(description.lower().split())
|
||||
|
||||
matched_terms = query_terms & (title_terms | desc_terms)
|
||||
if matched_terms:
|
||||
print(f"Matched terms: {', '.join(matched_terms)}")
|
||||
|
||||
# ============================================
|
||||
# Cache and Force Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scoring_with_cache(self, seeder):
|
||||
"""Test that scoring works correctly with cached results."""
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="injury update timeline",
|
||||
scoring_method="bm25",
|
||||
max_urls=10,
|
||||
force=False # Use cache
|
||||
)
|
||||
|
||||
# First run - populate cache
|
||||
results1 = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Second run - should use cache
|
||||
results2 = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Results should be identical
|
||||
assert len(results1) == len(results2)
|
||||
for r1, r2 in zip(results1, results2):
|
||||
assert r1["url"] == r2["url"]
|
||||
assert abs(r1["relevance_score"] - r2["relevance_score"]) < 0.001
|
||||
|
||||
print("\n✓ Cache produces consistent scores")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_force_refresh_scoring(self, seeder):
|
||||
"""Test force=True bypasses cache for fresh scoring."""
|
||||
config_cached = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="transfer window",
|
||||
scoring_method="bm25",
|
||||
max_urls=5,
|
||||
force=False
|
||||
)
|
||||
|
||||
config_forced = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="transfer window",
|
||||
scoring_method="bm25",
|
||||
max_urls=5,
|
||||
force=True
|
||||
)
|
||||
|
||||
# Run with cache
|
||||
start1 = asyncio.get_event_loop().time()
|
||||
results1 = await seeder.urls(TEST_DOMAIN, config_cached)
|
||||
time1 = asyncio.get_event_loop().time() - start1
|
||||
|
||||
# Run with force (should be slower due to fresh fetch)
|
||||
start2 = asyncio.get_event_loop().time()
|
||||
results2 = await seeder.urls(TEST_DOMAIN, config_forced)
|
||||
time2 = asyncio.get_event_loop().time() - start2
|
||||
|
||||
print(f"\nCached run: {time1:.2f}s")
|
||||
print(f"Forced run: {time2:.2f}s")
|
||||
|
||||
# Both should produce scored results
|
||||
assert all("relevance_score" in r for r in results1)
|
||||
assert all("relevance_score" in r for r in results2)
|
||||
|
||||
# ============================================
|
||||
# Source Combination Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scoring_with_multiple_sources(self, seeder):
|
||||
"""Test BM25 scoring with combined sources (cc+sitemap)."""
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
extract_head=True,
|
||||
query="match highlights video",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.3,
|
||||
max_urls=30,
|
||||
concurrency=15
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
# Verify we got results from both sources
|
||||
print(f"\nCombined sources returned {len(results)} URLs above threshold")
|
||||
|
||||
# Check URL diversity
|
||||
unique_paths = set()
|
||||
for r in results:
|
||||
path = r["url"].replace("https://", "").replace("http://", "").split("/", 1)[-1]
|
||||
unique_paths.add(path.split("?")[0]) # Remove query params
|
||||
|
||||
print(f"Unique paths found: {len(unique_paths)}")
|
||||
|
||||
# All should be scored and above threshold
|
||||
assert all(r["relevance_score"] >= 0.3 for r in results)
|
||||
|
||||
# ============================================
|
||||
# Integration Tests
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_full_workflow_integration(self, seeder):
|
||||
"""Test complete workflow: discover -> score -> filter -> use."""
|
||||
# Step 1: Discover and score URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="premier league opening fixtures",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.4,
|
||||
max_urls=10,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
print(f"\nStep 1: Found {len(results)} relevant URLs")
|
||||
|
||||
# Step 2: Analyze top results
|
||||
if results:
|
||||
top_urls = results[:3]
|
||||
print("\nStep 2: Top 3 URLs for crawling:")
|
||||
for i, r in enumerate(top_urls):
|
||||
print(f"{i+1}. Score: {r['relevance_score']:.3f}")
|
||||
print(f" URL: {r['url']}")
|
||||
print(f" Title: {r['head_data'].get('title', 'N/A')}")
|
||||
|
||||
# Check metadata quality
|
||||
meta = r['head_data'].get('meta', {})
|
||||
if 'description' in meta:
|
||||
print(f" Description: {meta['description'][:80]}...")
|
||||
|
||||
# Step 3: Verify these URLs would be good for actual crawling
|
||||
assert all(r["status"] == "valid" for r in results[:3])
|
||||
print("\nStep 3: All top URLs are valid for crawling ✓")
|
||||
|
||||
# ============================================
|
||||
# Report Generation
|
||||
# ============================================
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_scoring_report(self, seeder):
|
||||
"""Generate a comprehensive report of BM25 scoring effectiveness."""
|
||||
queries = {
|
||||
"beginner": "match schedule",
|
||||
"advanced": "tactical analysis pressing",
|
||||
"api": "VAR decision explanation",
|
||||
"deployment": "fixture changes due to weather",
|
||||
"extraction": "expected goals statistics"
|
||||
}
|
||||
|
||||
report = {
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"domain": TEST_DOMAIN,
|
||||
"results": {}
|
||||
}
|
||||
|
||||
for category, query in queries.items():
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
results = await seeder.urls(TEST_DOMAIN, config)
|
||||
|
||||
report["results"][category] = {
|
||||
"query": query,
|
||||
"total_results": len(results),
|
||||
"top_results": [
|
||||
{
|
||||
"url": r["url"],
|
||||
"score": r["relevance_score"],
|
||||
"title": r["head_data"].get("title", "")
|
||||
}
|
||||
for r in results[:3]
|
||||
],
|
||||
"score_distribution": {
|
||||
"min": min(r["relevance_score"] for r in results) if results else 0,
|
||||
"max": max(r["relevance_score"] for r in results) if results else 0,
|
||||
"avg": sum(r["relevance_score"] for r in results) / len(results) if results else 0
|
||||
}
|
||||
}
|
||||
|
||||
# Print report
|
||||
print("\n" + "="*60)
|
||||
print("BM25 SCORING EFFECTIVENESS REPORT")
|
||||
print("="*60)
|
||||
print(f"Domain: {report['domain']}")
|
||||
print(f"Timestamp: {report['timestamp']}")
|
||||
print("\nResults by Category:")
|
||||
|
||||
for category, data in report["results"].items():
|
||||
print(f"\n{category.upper()}: '{data['query']}'")
|
||||
print(f" Total results: {data['total_results']}")
|
||||
print(f" Score range: {data['score_distribution']['min']:.3f} - {data['score_distribution']['max']:.3f}")
|
||||
print(f" Average score: {data['score_distribution']['avg']:.3f}")
|
||||
print(" Top matches:")
|
||||
for i, result in enumerate(data['top_results']):
|
||||
print(f" {i+1}. [{result['score']:.3f}] {result['title']}")
|
||||
|
||||
|
||||
# ============================================
|
||||
# Standalone test runner
|
||||
# ============================================
|
||||
|
||||
async def run_all_tests():
|
||||
"""Run all tests standalone (without pytest)."""
|
||||
print("Running AsyncUrlSeeder BM25 Tests...")
|
||||
print("="*60)
|
||||
|
||||
test_instance = TestAsyncUrlSeederBM25()
|
||||
seeder = await test_instance.create_seeder()
|
||||
|
||||
# Run each test method
|
||||
test_methods = [
|
||||
# test_instance.test_basic_bm25_scoring,
|
||||
# test_instance.test_query_variations,
|
||||
# test_instance.test_score_threshold_filtering,
|
||||
# test_instance.test_extreme_thresholds,
|
||||
# test_instance.test_comprehensive_metadata_extraction,
|
||||
# test_instance.test_jsonld_extraction_scoring,
|
||||
# test_instance.test_empty_query,
|
||||
# test_instance.test_query_without_extract_head,
|
||||
# test_instance.test_special_characters_in_query,
|
||||
# test_instance.test_unicode_query,
|
||||
# test_instance.test_large_scale_scoring,
|
||||
# test_instance.test_concurrent_scoring_consistency,
|
||||
# test_instance.test_many_urls_with_scoring,
|
||||
test_instance.test_multi_word_complex_queries,
|
||||
test_instance.test_scoring_with_cache,
|
||||
test_instance.test_force_refresh_scoring,
|
||||
test_instance.test_scoring_with_multiple_sources,
|
||||
test_instance.test_full_workflow_integration,
|
||||
test_instance.test_generate_scoring_report
|
||||
]
|
||||
|
||||
for test_method in test_methods:
|
||||
try:
|
||||
print(f"\nRunning {test_method.__name__}...")
|
||||
await test_method(seeder)
|
||||
print(f"✓ {test_method.__name__} passed")
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f"✗ {test_method.__name__} failed: {str(e)}")
|
||||
print(f" Error type: {type(e).__name__}")
|
||||
traceback.print_exc()
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("Test suite completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run tests directly
|
||||
asyncio.run(run_all_tests())
|
||||
Reference in New Issue
Block a user