feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from feature/link-extractor branch implementing comprehensive link analysis:
- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
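For orientation, a minimal usage sketch of the feature described above (an editor's illustration, not part of the diff). It assumes LinkExtractionConfig is imported from crawl4ai.async_configs as defined in this patch, that the crawler API (AsyncWebCrawler, arun) is used as in existing crawl4ai code, and that the link dicts on the result carry the new score fields; the URL is a placeholder.

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    from crawl4ai.async_configs import LinkExtractionConfig

    async def main():
        link_config = LinkExtractionConfig(
            include_internal=True,           # same-domain links only
            include_patterns=["*/docs/*"],   # glob-based filtering
            query="api reference",           # enables BM25 contextual scoring
            max_links=50,
        )
        run_config = CrawlerRunConfig(
            link_extraction_config=link_config,
            score_links=True,                # also compute intrinsic (URL quality) scores
        )
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://docs.example.com", config=run_config)
            for link in result.links["internal"][:5]:
                print(link.get("href"), link.get("total_score"))

    asyncio.run(main())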
@@ -37,6 +37,7 @@ from .content_filter_strategy import (
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_extractor import LinkExtractor
from .async_dispatcher import (
    MemoryAdaptiveDispatcher,
    SemaphoreDispatcher,
@@ -141,6 +142,7 @@ __all__ = [
    "SemaphoreDispatcher",
    "RateLimiter",
    "CrawlerMonitor",
    "LinkExtractor",
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",

@@ -17,7 +17,7 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking

from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy

from .cache_context import CacheMode
@@ -594,6 +594,101 @@ class BrowserConfig:
            return config
        return BrowserConfig.from_kwargs(config)


class LinkExtractionConfig:
    """Configuration for link head extraction and scoring."""

    def __init__(
        self,
        include_internal: bool = True,
        include_external: bool = False,
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        concurrency: int = 10,
        timeout: int = 5,
        max_links: int = 100,
        query: Optional[str] = None,
        score_threshold: Optional[float] = None,
        verbose: bool = False
    ):
        """
        Initialize link extraction configuration.

        Args:
            include_internal: Whether to include same-domain links
            include_external: Whether to include different-domain links
            include_patterns: List of glob patterns to include (e.g., ["*/docs/*", "*/api/*"])
            exclude_patterns: List of glob patterns to exclude (e.g., ["*/login*", "*/admin*"])
            concurrency: Number of links to process simultaneously
            timeout: Timeout in seconds for each link's head extraction
            max_links: Maximum number of links to process (prevents overload)
            query: Query string for BM25 contextual scoring (optional)
            score_threshold: Minimum relevance score to include links (0.0-1.0, optional)
            verbose: Show detailed progress during extraction
        """
        self.include_internal = include_internal
        self.include_external = include_external
        self.include_patterns = include_patterns
        self.exclude_patterns = exclude_patterns
        self.concurrency = concurrency
        self.timeout = timeout
        self.max_links = max_links
        self.query = query
        self.score_threshold = score_threshold
        self.verbose = verbose

        # Validation
        if concurrency <= 0:
            raise ValueError("concurrency must be positive")
        if timeout <= 0:
            raise ValueError("timeout must be positive")
        if max_links <= 0:
            raise ValueError("max_links must be positive")
        if score_threshold is not None and not (0.0 <= score_threshold <= 1.0):
            raise ValueError("score_threshold must be between 0.0 and 1.0")
        if not include_internal and not include_external:
            raise ValueError("At least one of include_internal or include_external must be True")

    @staticmethod
    def from_dict(config_dict: Dict[str, Any]) -> "LinkExtractionConfig":
        """Create LinkExtractionConfig from dictionary (for backward compatibility)."""
        if not config_dict:
            return None

        return LinkExtractionConfig(
            include_internal=config_dict.get("include_internal", True),
            include_external=config_dict.get("include_external", False),
            include_patterns=config_dict.get("include_patterns"),
            exclude_patterns=config_dict.get("exclude_patterns"),
            concurrency=config_dict.get("concurrency", 10),
            timeout=config_dict.get("timeout", 5),
            max_links=config_dict.get("max_links", 100),
            query=config_dict.get("query"),
            score_threshold=config_dict.get("score_threshold"),
            verbose=config_dict.get("verbose", False)
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary format."""
        return {
            "include_internal": self.include_internal,
            "include_external": self.include_external,
            "include_patterns": self.include_patterns,
            "exclude_patterns": self.exclude_patterns,
            "concurrency": self.concurrency,
            "timeout": self.timeout,
            "max_links": self.max_links,
            "query": self.query,
            "score_threshold": self.score_threshold,
            "verbose": self.verbose
        }

    def clone(self, **kwargs) -> "LinkExtractionConfig":
        """Create a copy with updated values."""
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return LinkExtractionConfig.from_dict(config_dict)
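As a quick illustration of the helpers above, a short editor's sketch (values are arbitrary, assuming LinkExtractionConfig is importable from crawl4ai.async_configs as in this patch):

    from crawl4ai.async_configs import LinkExtractionConfig

    base = LinkExtractionConfig(include_patterns=["*/docs/*"], query="installation")

    # Dict round-trip, as used for backward compatibility
    restored = LinkExtractionConfig.from_dict(base.to_dict())
    assert restored.include_patterns == ["*/docs/*"]

    # clone() copies the config and overrides selected fields
    stricter = base.clone(score_threshold=0.5, max_links=25)
    assert stricter.query == "installation" and stricter.max_links == 25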


class HTTPCrawlerConfig:
    """HTTP-specific crawler configuration"""

@@ -829,6 +924,9 @@ class CrawlerRunConfig():
            Default: [].
        exclude_internal_links (bool): If True, exclude internal links from the results.
            Default: False.
        score_links (bool): If True, calculate intrinsic quality scores for all links using URL structure,
            text quality, and contextual relevance metrics. Separate from link_extraction_config.
            Default: False.

        # Debugging and Logging Parameters
        verbose (bool): Enable verbose logging.
@@ -939,6 +1037,7 @@ class CrawlerRunConfig():
        exclude_social_media_links: bool = False,
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -955,6 +1054,8 @@ class CrawlerRunConfig():
        user_agent_generator_config: dict = {},
        # Deep Crawl Parameters
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
        # Link Extraction Parameters
        link_extraction_config: Union[LinkExtractionConfig, Dict[str, Any]] = None,
        # Experimental Parameters
        experimental: Dict[str, Any] = None,
    ):
@@ -976,7 +1077,7 @@ class CrawlerRunConfig():
        self.remove_forms = remove_forms
        self.prettiify = prettiify
        self.parser_type = parser_type
        self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
        self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
        self.proxy_config = proxy_config
        self.proxy_rotation_strategy = proxy_rotation_strategy

@@ -1042,6 +1143,7 @@ class CrawlerRunConfig():
        self.exclude_social_media_links = exclude_social_media_links
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links

        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1084,6 +1186,17 @@ class CrawlerRunConfig():
        # Deep Crawl Parameters
        self.deep_crawl_strategy = deep_crawl_strategy

        # Link Extraction Parameters
        if link_extraction_config is None:
            self.link_extraction_config = None
        elif isinstance(link_extraction_config, LinkExtractionConfig):
            self.link_extraction_config = link_extraction_config
        elif isinstance(link_extraction_config, dict):
            # Convert dict to config object for backward compatibility
            self.link_extraction_config = LinkExtractionConfig.from_dict(link_extraction_config)
        else:
            raise ValueError("link_extraction_config must be LinkExtractionConfig object or dict")

        # Experimental Parameters
        self.experimental = experimental or {}

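Per the normalization above, both forms below should end up as a LinkExtractionConfig on the run config; this is an illustrative sketch, not part of the patch.

    from crawl4ai import CrawlerRunConfig
    from crawl4ai.async_configs import LinkExtractionConfig

    cfg_obj = CrawlerRunConfig(link_extraction_config=LinkExtractionConfig(concurrency=20))
    cfg_dict = CrawlerRunConfig(link_extraction_config={"concurrency": 20, "include_external": True})
    assert isinstance(cfg_dict.link_extraction_config, LinkExtractionConfig)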
@@ -1241,6 +1354,7 @@ class CrawlerRunConfig():
            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1256,6 +1370,8 @@ class CrawlerRunConfig():
            user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
            # Deep Crawl Parameters
            deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
            # Link Extraction Parameters
            link_extraction_config=kwargs.get("link_extraction_config"),
            url=kwargs.get("url"),
            # Experimental Parameters
            experimental=kwargs.get("experimental"),
@@ -1339,6 +1455,7 @@ class CrawlerRunConfig():
            "exclude_social_media_links": self.exclude_social_media_links,
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
@@ -1350,6 +1467,7 @@ class CrawlerRunConfig():
            "user_agent_mode": self.user_agent_mode,
            "user_agent_generator_config": self.user_agent_generator_config,
            "deep_crawl_strategy": self.deep_crawl_strategy,
            "link_extraction_config": self.link_extraction_config.to_dict() if self.link_extraction_config else None,
            "url": self.url,
            "experimental": self.experimental,
        }

@@ -109,12 +109,16 @@ def _parse_head(src: str) -> Dict[str, Any]:
        elif "charset" in el.attrib:
            info["charset"] = el.attrib["charset"].lower()
    for el in doc.xpath(".//link"):
        rel = " ".join(el.attrib.get("rel", [])).lower()
        if not rel:
        rel_attr = el.attrib.get("rel", "")
        if not rel_attr:
            continue
        # Handle multiple space-separated rel values
        rel_values = rel_attr.lower().split()
        entry = {a: el.attrib[a] for a in (
            "href", "as", "type", "hreflang") if a in el.attrib}
        info["link"].setdefault(rel, []).append(entry)
        # Add entry for each rel value
        for rel in rel_values:
            info["link"].setdefault(rel, []).append(entry)
    # Extract JSON-LD structured data
    for script in doc.xpath('.//script[@type="application/ld+json"]'):
        if script.text:
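To show what the multi-rel handling above changes, a standalone sketch (an editor's illustration, not the module's code) of how a space-separated rel attribute is now indexed once per value:

    info = {"link": {}}
    attrs = {"rel": "preload prefetch", "href": "/app.js", "as": "script"}

    # Same entry dict, registered under each rel value
    entry = {a: attrs[a] for a in ("href", "as", "type", "hreflang") if a in attrs}
    for rel in attrs["rel"].lower().split():
        info["link"].setdefault(rel, []).append(entry)

    # info["link"] == {"preload": [entry], "prefetch": [entry]}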
@@ -467,6 +471,200 @@ class AsyncUrlSeeder:
|
||||
"info", "Finished URL seeding for multiple domains.", tag="URL_SEED")
|
||||
return final_results
|
||||
|
||||
async def extract_head_for_urls(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional["SeedingConfig"] = None,
|
||||
concurrency: int = 10,
|
||||
timeout: int = 5
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract head content for a custom list of URLs using URLSeeder's parallel processing.
|
||||
|
||||
This method reuses URLSeeder's efficient parallel processing, caching, and head extraction
|
||||
logic to process a custom list of URLs rather than discovering URLs from sources.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
urls : List[str]
|
||||
List of URLs to extract head content from
|
||||
config : SeedingConfig, optional
|
||||
Configuration object. If None, uses default settings for head extraction
|
||||
concurrency : int, default=10
|
||||
Number of concurrent requests
|
||||
timeout : int, default=5
|
||||
Timeout for each request in seconds
|
||||
|
||||
Returns
|
||||
-------
|
||||
List[Dict[str, Any]]
|
||||
List of dictionaries containing url, status, head_data, and optional relevance_score
|
||||
"""
|
||||
# Create default config if none provided
|
||||
if config is None:
|
||||
# Import here to avoid circular imports
|
||||
from .async_configs import SeedingConfig
|
||||
config = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=concurrency,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
# Override concurrency and ensure head extraction is enabled
|
||||
config.concurrency = concurrency
|
||||
config.extract_head = True
|
||||
|
||||
self._log("info", "Starting head extraction for {count} custom URLs",
|
||||
params={"count": len(urls)}, tag="URL_SEED")
|
||||
|
||||
# Setup rate limiting if specified in config
|
||||
if config.hits_per_sec:
|
||||
if config.hits_per_sec <= 0:
|
||||
self._log("warning", "hits_per_sec must be positive. Disabling rate limiting.", tag="URL_SEED")
|
||||
self._rate_sem = None
|
||||
else:
|
||||
self._rate_sem = asyncio.Semaphore(config.hits_per_sec)
|
||||
else:
|
||||
self._rate_sem = None
|
||||
|
||||
# Use bounded queue to prevent memory issues with large URL lists
|
||||
queue_size = min(10000, max(1000, concurrency * 100))
|
||||
queue = asyncio.Queue(maxsize=queue_size)
|
||||
producer_done = asyncio.Event()
|
||||
stop_event = asyncio.Event()
|
||||
seen: set[str] = set()
|
||||
|
||||
# Results collection
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
async def producer():
|
||||
"""Producer to feed URLs into the queue."""
|
||||
try:
|
||||
for url in urls:
|
||||
if url in seen:
|
||||
self._log("debug", "Skipping duplicate URL: {url}",
|
||||
params={"url": url}, tag="URL_SEED")
|
||||
continue
|
||||
if stop_event.is_set():
|
||||
break
|
||||
seen.add(url)
|
||||
await queue.put(url)
|
||||
finally:
|
||||
producer_done.set()
|
||||
|
||||
async def worker(res_list: List[Dict[str, Any]]):
|
||||
"""Worker to process URLs from the queue."""
|
||||
while True:
|
||||
try:
|
||||
# Wait for URL or producer completion
|
||||
url = await asyncio.wait_for(queue.get(), timeout=1.0)
|
||||
except asyncio.TimeoutError:
|
||||
if producer_done.is_set() and queue.empty():
|
||||
break
|
||||
continue
|
||||
|
||||
try:
|
||||
# Use existing _validate method which handles head extraction, caching, etc.
|
||||
await self._validate(
|
||||
url, res_list,
|
||||
live=False, # We're not doing live checks, just head extraction
|
||||
extract=True, # Always extract head content
|
||||
timeout=timeout,
|
||||
verbose=config.verbose or False,
|
||||
query=config.query,
|
||||
score_threshold=config.score_threshold,
|
||||
scoring_method=config.scoring_method or "bm25",
|
||||
filter_nonsense=config.filter_nonsense_urls
|
||||
)
|
||||
except Exception as e:
|
||||
self._log("error", "Failed to process URL {url}: {error}",
|
||||
params={"url": url, "error": str(e)}, tag="URL_SEED")
|
||||
# Add failed entry to results
|
||||
res_list.append({
|
||||
"url": url,
|
||||
"status": "failed",
|
||||
"head_data": {},
|
||||
"error": str(e)
|
||||
})
|
||||
finally:
|
||||
queue.task_done()
|
||||
|
||||
# Start producer
|
||||
producer_task = asyncio.create_task(producer())
|
||||
|
||||
# Start workers
|
||||
worker_tasks = []
|
||||
for _ in range(concurrency):
|
||||
worker_task = asyncio.create_task(worker(results))
|
||||
worker_tasks.append(worker_task)
|
||||
|
||||
# Wait for producer to finish
|
||||
await producer_task
|
||||
|
||||
# Wait for all items to be processed
|
||||
await queue.join()
|
||||
|
||||
# Cancel workers
|
||||
for task in worker_tasks:
|
||||
task.cancel()
|
||||
|
||||
# Wait for workers to finish canceling
|
||||
await asyncio.gather(*worker_tasks, return_exceptions=True)
|
||||
|
||||
# Apply BM25 scoring if query is provided
|
||||
if config.query and config.scoring_method == "bm25":
|
||||
results = await self._apply_bm25_scoring(results, config)
|
||||
|
||||
# Apply score threshold filtering
|
||||
if config.score_threshold is not None:
|
||||
results = [r for r in results if r.get("relevance_score", 0) >= config.score_threshold]
|
||||
|
||||
# Sort by relevance score if available
|
||||
if any("relevance_score" in r for r in results):
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0), reverse=True)
|
||||
|
||||
self._log("info", "Completed head extraction for {count} URLs, {success} successful",
|
||||
params={
|
||||
"count": len(urls),
|
||||
"success": len([r for r in results if r.get("status") == "valid"])
|
||||
}, tag="URL_SEED")
|
||||
|
||||
return results
|
||||
|
||||
async def _apply_bm25_scoring(self, results: List[Dict[str, Any]], config: "SeedingConfig") -> List[Dict[str, Any]]:
|
||||
"""Apply BM25 scoring to results that have head_data."""
|
||||
if not HAS_BM25:
|
||||
self._log("warning", "BM25 scoring requested but rank_bm25 not available", tag="URL_SEED")
|
||||
return results
|
||||
|
||||
# Extract text contexts from head data
|
||||
text_contexts = []
|
||||
valid_results = []
|
||||
|
||||
for result in results:
|
||||
if result.get("status") == "valid" and result.get("head_data"):
|
||||
text_context = self._extract_text_context(result["head_data"])
|
||||
if text_context:
|
||||
text_contexts.append(text_context)
|
||||
valid_results.append(result)
|
||||
else:
|
||||
# Use URL-based scoring as fallback
|
||||
score = self._calculate_url_relevance_score(config.query, result["url"])
|
||||
result["relevance_score"] = float(score)
|
||||
elif result.get("status") == "valid":
|
||||
# No head data but valid URL - use URL-based scoring
|
||||
score = self._calculate_url_relevance_score(config.query, result["url"])
|
||||
result["relevance_score"] = float(score)
|
||||
|
||||
# Calculate BM25 scores for results with text context
|
||||
if text_contexts and valid_results:
|
||||
scores = await asyncio.to_thread(self._calculate_bm25_score, config.query, text_contexts)
|
||||
for i, result in enumerate(valid_results):
|
||||
if i < len(scores):
|
||||
result["relevance_score"] = float(scores[i])
|
||||
|
||||
return results
|
||||
|
||||
async def _resolve_head(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
HEAD-probe a URL.
|
||||
|
||||
@@ -23,6 +23,8 @@ from .utils import (
    is_external_url,
    get_base_domain,
    extract_metadata_using_lxml,
    extract_page_context,
    calculate_link_intrinsic_score,
)
from lxml import etree
from lxml import html as lhtml
@@ -944,6 +946,72 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
@@ -1037,6 +1105,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
        media: Dict[str, List],
        internal_links_dict: Dict[str, Any],
        external_links_dict: Dict[str, Any],
        page_context: dict = None,
        **kwargs,
    ) -> bool:
        base_domain = kwargs.get("base_domain", get_base_domain(url))
@@ -1056,6 +1125,25 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                "title": link.get("title", "").strip(),
                "base_domain": base_domain,
            }

            # Add intrinsic scoring if enabled
            if kwargs.get("score_links", False) and page_context is not None:
                try:
                    intrinsic_score = calculate_link_intrinsic_score(
                        link_text=link_data["text"],
                        url=normalized_href,
                        title_attr=link_data["title"],
                        class_attr=link.get("class", ""),
                        rel_attr=link.get("rel", ""),
                        page_context=page_context
                    )
                    link_data["intrinsic_score"] = intrinsic_score
                except Exception:
                    # Fail gracefully - assign default score
                    link_data["intrinsic_score"] = float('inf')
            else:
                # No scoring enabled - assign infinity (all links equal priority)
                link_data["intrinsic_score"] = float('inf')

            is_external = is_external_url(normalized_href, base_domain)
            if is_external:
@@ -1491,6 +1579,33 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):

        base_domain = get_base_domain(url)

        # Extract page context for link scoring (if enabled) - do this BEFORE any removals
        page_context = None
        if kwargs.get("score_links", False):
            try:
                # Extract title
                title_elements = doc.xpath('//title')
                page_title = title_elements[0].text_content() if title_elements else ""

                # Extract headlines
                headlines = []
                for tag in ['h1', 'h2', 'h3']:
                    elements = doc.xpath(f'//{tag}')
                    for el in elements:
                        text = el.text_content().strip()
                        if text:
                            headlines.append(text)
                headlines_text = ' '.join(headlines)

                # Extract meta description
                meta_desc_elements = doc.xpath('//meta[@name="description"]/@content')
                meta_description = meta_desc_elements[0] if meta_desc_elements else ""

                # Create page context
                page_context = extract_page_context(page_title, headlines_text, meta_description, url)
            except Exception:
                page_context = {}  # Fail gracefully

        # Early removal of all images if exclude_all_images is set
        # This is more efficient in lxml as we remove elements before any processing
        if kwargs.get("exclude_all_images", False):
@@ -1579,6 +1694,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
            media,
            internal_links_dict,
            external_links_dict,
            page_context=page_context,
            base_domain=base_domain,
            **kwargs,
        )
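For reference, a hedged sketch of what the page-context step above feeds into extract_page_context (the helper added to utils.py later in this diff); the page data here is made up:

    from crawl4ai.utils import extract_page_context

    context = extract_page_context(
        page_title="Crawl4AI Documentation",
        headlines_text="Getting Started Installation Advanced Crawling",
        meta_description="Open-source web crawler for LLM-ready data.",
        base_url="https://docs.crawl4ai.com/core/installation/",
    )
    print(context["domain"])        # "docs.crawl4ai.com"
    print(context["is_docs_site"])  # True, because the domain contains "docs."
    print(sorted(context["terms"])[:5])  # lower-cased terms longer than 2 characters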
@@ -1623,14 +1739,84 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
method="html",
|
||||
with_tail=False,
|
||||
).strip()
|
||||
|
||||
# Create links dictionary in the format expected by LinkExtractor
|
||||
links = {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
}
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_extraction_config = kwargs.get("link_extraction_config")
|
||||
if link_extraction_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_extractor import LinkExtractor
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_extraction_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkExtractor
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_extraction_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_extraction_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkExtractor(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Error during link head extraction: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if head extraction fails
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": {
|
||||
"internal": list(internal_links_dict.values()),
|
||||
"external": list(external_links_dict.values()),
|
||||
},
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
crawl4ai/link_extractor.py (new file, 395 lines)
@@ -0,0 +1,395 @@
|
||||
"""
|
||||
Link Extractor for Crawl4AI
|
||||
|
||||
Extracts head content from links discovered during crawling using URLSeeder's
|
||||
efficient parallel processing and caching infrastructure.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import fnmatch
|
||||
from typing import Dict, List, Optional, Any
|
||||
from .async_logger import AsyncLogger
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
from .async_configs import SeedingConfig, CrawlerRunConfig
|
||||
from .models import Links, Link
|
||||
from .utils import calculate_total_score
|
||||
|
||||
|
||||
class LinkExtractor:
|
||||
"""
|
||||
Extracts head content from links using URLSeeder's parallel processing infrastructure.
|
||||
|
||||
This class provides intelligent link filtering and head content extraction with:
|
||||
- Pattern-based inclusion/exclusion filtering
|
||||
- Parallel processing with configurable concurrency
|
||||
- Caching for performance
|
||||
- BM25 relevance scoring
|
||||
- Memory-safe processing for large link sets
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLogger] = None):
|
||||
"""
|
||||
Initialize the LinkExtractor.
|
||||
|
||||
Args:
|
||||
logger: Optional logger instance for recording events
|
||||
"""
|
||||
self.logger = logger
|
||||
self.seeder: Optional[AsyncUrlSeeder] = None
|
||||
self._owns_seeder = False
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry."""
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Async context manager exit."""
|
||||
await self.close()
|
||||
|
||||
async def start(self):
|
||||
"""Initialize the URLSeeder instance."""
|
||||
if not self.seeder:
|
||||
self.seeder = AsyncUrlSeeder(logger=self.logger)
|
||||
await self.seeder.__aenter__()
|
||||
self._owns_seeder = True
|
||||
|
||||
async def close(self):
|
||||
"""Clean up resources."""
|
||||
if self.seeder and self._owns_seeder:
|
||||
await self.seeder.__aexit__(None, None, None)
|
||||
self.seeder = None
|
||||
self._owns_seeder = False
|
||||
|
||||
def _log(self, level: str, message: str, tag: str = "LINK_EXTRACT", **kwargs):
|
||||
"""Helper method to safely log messages."""
|
||||
if self.logger:
|
||||
log_method = getattr(self.logger, level, None)
|
||||
if log_method:
|
||||
log_method(message=message, tag=tag, params=kwargs.get('params', {}))
|
||||
|
||||
async def extract_link_heads(
|
||||
self,
|
||||
links: Links,
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Extract head content for filtered links and attach to Link objects.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
config: CrawlerRunConfig with link_extraction_config settings
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to filtered Link objects
|
||||
"""
|
||||
link_config = config.link_extraction_config
|
||||
|
||||
# Ensure seeder is initialized
|
||||
await self.start()
|
||||
|
||||
# Filter links based on configuration
|
||||
filtered_urls = self._filter_links(links, link_config)
|
||||
|
||||
if not filtered_urls:
|
||||
self._log("info", "No links matched filtering criteria")
|
||||
return links
|
||||
|
||||
self._log("info", "Extracting head content for {count} filtered links",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Extract head content using URLSeeder
|
||||
head_results = await self._extract_heads_parallel(filtered_urls, link_config)
|
||||
|
||||
# Merge results back into Link objects
|
||||
updated_links = self._merge_head_data(links, head_results, config)
|
||||
|
||||
self._log("info", "Completed head extraction for links, {success} successful",
|
||||
params={"success": len([r for r in head_results if r.get("status") == "valid"])})
|
||||
|
||||
return updated_links
|
||||
|
||||
def _filter_links(self, links: Links, link_config: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Filter links based on configuration parameters.
|
||||
|
||||
Args:
|
||||
links: Links object containing internal and external links
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of filtered URL strings
|
||||
"""
|
||||
filtered_urls = []
|
||||
|
||||
# Include internal links if configured
|
||||
if link_config.include_internal:
|
||||
filtered_urls.extend([link.href for link in links.internal if link.href])
|
||||
self._log("debug", "Added {count} internal links",
|
||||
params={"count": len(links.internal)})
|
||||
|
||||
# Include external links if configured
|
||||
if link_config.include_external:
|
||||
filtered_urls.extend([link.href for link in links.external if link.href])
|
||||
self._log("debug", "Added {count} external links",
|
||||
params={"count": len(links.external)})
|
||||
|
||||
# Apply include patterns
|
||||
include_patterns = link_config.include_patterns
|
||||
if include_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if any(fnmatch.fnmatch(url, pattern) for pattern in include_patterns)
|
||||
]
|
||||
self._log("debug", "After include patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Apply exclude patterns
|
||||
exclude_patterns = link_config.exclude_patterns
|
||||
if exclude_patterns:
|
||||
filtered_urls = [
|
||||
url for url in filtered_urls
|
||||
if not any(fnmatch.fnmatch(url, pattern) for pattern in exclude_patterns)
|
||||
]
|
||||
self._log("debug", "After exclude patterns: {count} links remain",
|
||||
params={"count": len(filtered_urls)})
|
||||
|
||||
# Limit number of links
|
||||
max_links = link_config.max_links
|
||||
if max_links > 0 and len(filtered_urls) > max_links:
|
||||
filtered_urls = filtered_urls[:max_links]
|
||||
self._log("debug", "Limited to {max_links} links",
|
||||
params={"max_links": max_links})
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_urls = []
|
||||
for url in filtered_urls:
|
||||
if url not in seen:
|
||||
seen.add(url)
|
||||
unique_urls.append(url)
|
||||
|
||||
self._log("debug", "Final filtered URLs: {count} unique links",
|
||||
params={"count": len(unique_urls)})
|
||||
|
||||
return unique_urls
|
||||
|
||||
async def _extract_heads_parallel(
|
||||
self,
|
||||
urls: List[str],
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract head content for URLs using URLSeeder's parallel processing.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to process
|
||||
link_config: Configuration dictionary for link extraction
|
||||
|
||||
Returns:
|
||||
List of dictionaries with url, status, head_data, and optional relevance_score
|
||||
"""
|
||||
verbose = link_config.verbose
|
||||
concurrency = link_config.concurrency
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting batch processing: {total} links with {concurrency} concurrent workers",
|
||||
params={"total": len(urls), "concurrency": concurrency})
|
||||
|
||||
# Create SeedingConfig for URLSeeder
|
||||
seeding_config = SeedingConfig(
|
||||
extract_head=True,
|
||||
concurrency=concurrency,
|
||||
hits_per_sec=getattr(link_config, 'hits_per_sec', None),
|
||||
query=link_config.query,
|
||||
score_threshold=link_config.score_threshold,
|
||||
scoring_method="bm25" if link_config.query else None,
|
||||
verbose=verbose
|
||||
)
|
||||
|
||||
# Use URLSeeder's extract_head_for_urls method with progress tracking
|
||||
if verbose:
|
||||
# Create a wrapper to track progress
|
||||
results = await self._extract_with_progress(urls, seeding_config, link_config)
|
||||
else:
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
async def _extract_with_progress(
|
||||
self,
|
||||
urls: List[str],
|
||||
seeding_config: SeedingConfig,
|
||||
link_config: Dict[str, Any]
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Extract head content with progress reporting."""
|
||||
|
||||
total_urls = len(urls)
|
||||
concurrency = link_config.concurrency
|
||||
batch_size = max(1, total_urls // 10) # Report progress every 10%
|
||||
|
||||
# Process URLs and track progress
|
||||
completed = 0
|
||||
successful = 0
|
||||
failed = 0
|
||||
|
||||
# Create a custom progress tracking version
|
||||
# We'll modify URLSeeder's method to include progress callbacks
|
||||
|
||||
# For now, let's use the existing method and report at the end
|
||||
# In a production version, we would modify URLSeeder to accept progress callbacks
|
||||
|
||||
self._log("info", "Processing links in batches...")
|
||||
|
||||
# Use existing method
|
||||
results = await self.seeder.extract_head_for_urls(
|
||||
urls=urls,
|
||||
config=seeding_config,
|
||||
concurrency=concurrency,
|
||||
timeout=link_config.timeout
|
||||
)
|
||||
|
||||
# Count results
|
||||
for result in results:
|
||||
completed += 1
|
||||
if result.get("status") == "valid":
|
||||
successful += 1
|
||||
else:
|
||||
failed += 1
|
||||
|
||||
# Final progress report
|
||||
self._log("info", "Batch processing completed: {completed}/{total} processed, {successful} successful, {failed} failed",
|
||||
params={
|
||||
"completed": completed,
|
||||
"total": total_urls,
|
||||
"successful": successful,
|
||||
"failed": failed
|
||||
})
|
||||
|
||||
return results
|
||||
|
||||
def _merge_head_data(
|
||||
self,
|
||||
original_links: Links,
|
||||
head_results: List[Dict[str, Any]],
|
||||
config: CrawlerRunConfig
|
||||
) -> Links:
|
||||
"""
|
||||
Merge head extraction results back into Link objects.
|
||||
|
||||
Args:
|
||||
original_links: Original Links object
|
||||
head_results: Results from head extraction
|
||||
|
||||
Returns:
|
||||
Links object with head_data attached to matching links
|
||||
"""
|
||||
# Create URL to head_data mapping
|
||||
url_to_head_data = {}
|
||||
for result in head_results:
|
||||
url = result.get("url")
|
||||
if url:
|
||||
url_to_head_data[url] = {
|
||||
"head_data": result.get("head_data", {}),
|
||||
"status": result.get("status", "unknown"),
|
||||
"error": result.get("error"),
|
||||
"relevance_score": result.get("relevance_score")
|
||||
}
|
||||
|
||||
# Update internal links
|
||||
updated_internal = []
|
||||
for link in original_links.internal:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_internal.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_internal.append(link)
|
||||
|
||||
# Update external links
|
||||
updated_external = []
|
||||
for link in original_links.external:
|
||||
if link.href in url_to_head_data:
|
||||
head_info = url_to_head_data[link.href]
|
||||
# Create new Link object with head data and scoring
|
||||
contextual_score = head_info.get("relevance_score")
|
||||
|
||||
updated_link = Link(
|
||||
href=link.href,
|
||||
text=link.text,
|
||||
title=link.title,
|
||||
base_domain=link.base_domain,
|
||||
head_data=head_info["head_data"],
|
||||
head_extraction_status=head_info["status"],
|
||||
head_extraction_error=head_info.get("error"),
|
||||
intrinsic_score=getattr(link, 'intrinsic_score', None),
|
||||
contextual_score=contextual_score
|
||||
)
|
||||
|
||||
# Add relevance score to head_data for backward compatibility
|
||||
if contextual_score is not None:
|
||||
updated_link.head_data = updated_link.head_data or {}
|
||||
updated_link.head_data["relevance_score"] = contextual_score
|
||||
|
||||
# Calculate total score combining intrinsic and contextual scores
|
||||
updated_link.total_score = calculate_total_score(
|
||||
intrinsic_score=updated_link.intrinsic_score,
|
||||
contextual_score=updated_link.contextual_score,
|
||||
score_links_enabled=getattr(config, 'score_links', False),
|
||||
query_provided=bool(config.link_extraction_config.query)
|
||||
)
|
||||
|
||||
updated_external.append(updated_link)
|
||||
else:
|
||||
# Keep original link unchanged
|
||||
updated_external.append(link)
|
||||
|
||||
# Sort links by relevance score if available
|
||||
if any(hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data
|
||||
for link in updated_internal + updated_external):
|
||||
|
||||
def get_relevance_score(link):
|
||||
if hasattr(link, 'head_data') and link.head_data and 'relevance_score' in link.head_data:
|
||||
return link.head_data['relevance_score']
|
||||
return 0.0
|
||||
|
||||
updated_internal.sort(key=get_relevance_score, reverse=True)
|
||||
updated_external.sort(key=get_relevance_score, reverse=True)
|
||||
|
||||
return Links(
|
||||
internal=updated_internal,
|
||||
external=updated_external
|
||||
)
|
||||
@@ -345,6 +345,12 @@ class Link(BaseModel):
    text: Optional[str] = ""
    title: Optional[str] = ""
    base_domain: Optional[str] = ""
    head_data: Optional[Dict[str, Any]] = None  # Head metadata extracted from link target
    head_extraction_status: Optional[str] = None  # "success", "failed", "skipped"
    head_extraction_error: Optional[str] = None  # Error message if extraction failed
    intrinsic_score: Optional[float] = None  # Quality score based on URL structure, text, and context
    contextual_score: Optional[float] = None  # BM25 relevance score based on query and head content
    total_score: Optional[float] = None  # Combined score from intrinsic and contextual scores


class Media(BaseModel):
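A short, illustrative sketch of the new fields (the values are made up; in practice they are filled in by the link extraction and scoring steps of this patch):

    from crawl4ai.models import Link

    link = Link(
        href="https://docs.example.com/api/",
        text="API reference",
        intrinsic_score=6.5,        # URL/text quality, 0-10
        contextual_score=0.42,      # BM25 relevance against the query, roughly 0-1
        total_score=6.9,            # weighted combination, see calculate_total_score
    )
    print(link.dict()["total_score"])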
@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
echo "✅ Setup complete!"
''')


# Link Quality Scoring Functions
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.
    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page

    Returns:
        Dictionary containing page context data for fast link scoring
    """
    context = {
        'terms': set(),
        'headlines': headlines_text or '',
        'meta_description': meta_description or '',
        'domain': '',
        'is_docs_site': False
    }

    try:
        from urllib.parse import urlparse
        parsed = urlparse(base_url)
        context['domain'] = parsed.netloc.lower()

        # Check if this is a documentation/reference site
        context['is_docs_site'] = any(indicator in context['domain']
                                      for indicator in ['docs.', 'api.', 'developer.', 'reference.'])

        # Create term set for fast intersection (performance optimization)
        all_text = ((page_title or '') + ' ' + context['headlines'] + ' ' + context['meta_description']).lower()
        # Simple tokenization - fast and sufficient for scoring
        context['terms'] = set(word.strip('.,!?;:"()[]{}')
                               for word in all_text.split()
                               if len(word.strip('.,!?;:"()[]{}')) > 2)

    except Exception:
        # Fail gracefully - return empty context
        pass

    return context


def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better
    """
    score = 0.0

    try:
        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
        if title_attr and len(title_attr.strip()) > 3:
            score += 1.0

        class_str = (class_attr or '').lower()
        # Navigation/important classes boost score
        if any(nav_class in class_str for nav_class in ['nav', 'menu', 'primary', 'main', 'important']):
            score += 1.5
        # Marketing/ad classes reduce score
        if any(bad_class in class_str for bad_class in ['ad', 'sponsor', 'track', 'promo', 'banner']):
            score -= 1.0

        rel_str = (rel_attr or '').lower()
        # Semantic rel values
        if any(good_rel in rel_str for good_rel in ['canonical', 'next', 'prev', 'chapter']):
            score += 1.0
        if any(bad_rel in rel_str for bad_rel in ['nofollow', 'sponsored', 'ugc']):
            score -= 0.5

        # 2. URL STRUCTURE QUALITY (string operations - very fast)
        url_lower = url.lower()

        # High-value path patterns
        if any(good_path in url_lower for good_path in ['/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/']):
            score += 2.0
        elif any(medium_path in url_lower for medium_path in ['/blog/', '/article/', '/post/', '/news/']):
            score += 1.0

        # Penalize certain patterns
        if any(bad_path in url_lower for bad_path in ['/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/']):
            score -= 1.5

        # URL depth (shallow URLs often more important)
        url_depth = url.count('/') - 2  # Subtract protocol and domain
        if url_depth <= 2:
            score += 1.0
        elif url_depth > 5:
            score -= 0.5

        # HTTPS bonus
        if url.startswith('https://'):
            score += 0.5

        # 3. TEXT QUALITY (string analysis - very fast)
        if link_text:
            text_clean = link_text.strip()
            if len(text_clean) > 3:
                score += 1.0

            # Multi-word links are usually more descriptive
            word_count = len(text_clean.split())
            if word_count >= 2:
                score += 0.5
            if word_count >= 4:
                score += 0.5

            # Avoid generic link text
            generic_texts = ['click here', 'read more', 'more info', 'link', 'here']
            if text_clean.lower() in generic_texts:
                score -= 1.0

        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
        if page_context.get('terms') and link_text:
            link_words = set(word.strip('.,!?;:"()[]{}').lower()
                             for word in link_text.split()
                             if len(word.strip('.,!?;:"()[]{}')) > 2)

            if link_words:
                # Calculate word overlap ratio
                overlap = len(link_words & page_context['terms'])
                if overlap > 0:
                    relevance_ratio = overlap / min(len(link_words), 10)  # Cap to avoid over-weighting
                    score += relevance_ratio * 2.0  # Up to 2 points for relevance

        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
        if page_context.get('is_docs_site', False):
            # Documentation sites: prioritize internal navigation
            if link_text and any(doc_keyword in link_text.lower()
                                 for doc_keyword in ['api', 'reference', 'guide', 'tutorial', 'example']):
                score += 1.0

    except Exception:
        # Fail gracefully - return minimal score
        score = 0.5

    # Ensure score is within reasonable bounds
    return max(0.0, min(score, 10.0))
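A hedged usage sketch of the intrinsic scorer above, with made-up attributes and assuming both helpers are exported from crawl4ai.utils as in this diff. The exact value depends on the heuristics, but a descriptive, shallow, documentation-style link like this one should land near the top of the 0-10 range (title attribute, nav class, /api/ path, https, multi-word text, and term overlap with the page all add points).

    from crawl4ai.utils import extract_page_context, calculate_link_intrinsic_score

    ctx = extract_page_context(
        page_title="Crawl4AI Documentation",
        headlines_text="API Reference Crawler Configuration",
        meta_description="Docs for the crawl4ai library.",
        base_url="https://docs.crawl4ai.com/",
    )
    score = calculate_link_intrinsic_score(
        link_text="Crawler API Reference",
        url="https://docs.crawl4ai.com/api/crawler/",
        title_attr="Full crawler API reference",
        class_attr="nav-link primary",
        rel_attr="",
        page_context=ctx,
    )
    print(round(score, 2))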


def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic: return normalized intrinsic score
        - Only contextual: return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    """
    # Case 1: No scoring enabled at all
    if not score_links_enabled:
        return 5.0  # Neutral score - all links treated equally

    # Normalize scores to handle None values
    intrinsic = intrinsic_score if intrinsic_score is not None else 0.0
    contextual = contextual_score if contextual_score is not None else 0.0

    # Case 2: Only intrinsic scoring (no query provided or no head extraction)
    if not query_provided or contextual_score is None:
        # Use intrinsic score directly (already 0-10 scale)
        return max(0.0, min(intrinsic, 10.0))

    # Case 3: Both intrinsic and contextual scores available
    # Scale contextual score (typically 0-1) to 0-10 range
    contextual_scaled = min(contextual * 10.0, 10.0)

    # Weighted combination: 70% intrinsic (structure/content quality) + 30% contextual (query relevance)
    # This gives more weight to link quality while still considering relevance
    total = (intrinsic * 0.7) + (contextual_scaled * 0.3)

    return max(0.0, min(total, 10.0))
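A small worked example of the weighting described above (arbitrary values, assuming calculate_total_score is importable from crawl4ai.utils as in this diff):

    from crawl4ai.utils import calculate_total_score

    # Both layers available: 70% intrinsic + 30% contextual (scaled from ~0-1 to 0-10)
    total = calculate_total_score(
        intrinsic_score=6.0,
        contextual_score=0.8,
        score_links_enabled=True,
        query_provided=True,
    )
    # 6.0 * 0.7 + (0.8 * 10) * 0.3 = 4.2 + 2.4 = 6.6
    assert abs(total - 6.6) < 1e-9

    # Scoring disabled: every link gets the neutral 5.0
    assert calculate_total_score(score_links_enabled=False) == 5.0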