feat: Add advanced link head extraction with three-layer scoring system (#1)

Squashed commit from feature/link-extractor branch implementing comprehensive link analysis:

- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
This commit is contained in:
UncleCode
2025-06-27 20:06:04 +08:00
parent e528086341
commit 5c9c305dbf
10 changed files with 2126 additions and 15 deletions

View File

@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
echo "✅ Setup complete!"
''')
# Link Quality Scoring Functions
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.

    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page
        headlines_text: Combined text from h1, h2, h3 elements
        meta_description: Meta description content
        base_url: Base URL of the page

    Returns:
        Dictionary containing page context data for fast link scoring:
            'terms': set of lowercase words (longer than 2 characters after
                punctuation stripping) from title, headlines and description
            'headlines': headlines_text, or '' when falsy
            'meta_description': meta_description, or '' when falsy
            'domain': lowercase network location of base_url ('' if unparsable)
            'is_docs_site': True when the domain contains one of
                'docs.', 'api.', 'developer.', 'reference.'
    """
    from urllib.parse import urlparse

    punctuation = '.,!?;:"()[]{}'

    context = {
        'terms': set(),
        'headlines': headlines_text or '',
        'meta_description': meta_description or '',
        'domain': '',
        'is_docs_site': False
    }

    # URL analysis is isolated from tokenization: a malformed or missing
    # base_url must not throw away the text-derived terms below.
    try:
        domain = urlparse(base_url or '').netloc.lower()
        # Check if this is a documentation/reference site
        is_docs_site = any(indicator in domain
                           for indicator in ('docs.', 'api.', 'developer.', 'reference.'))
    except (TypeError, ValueError, AttributeError):
        # Fail gracefully - keep the empty domain defaults
        pass
    else:
        context['domain'] = domain
        context['is_docs_site'] = is_docs_site

    # Create term set for fast intersection (performance optimization)
    try:
        all_text = ((page_title or '') + ' ' + context['headlines'] + ' ' + context['meta_description']).lower()
        # Simple tokenization - fast and sufficient for scoring.
        # Strip each word once, then keep only words longer than 2 characters.
        stripped_words = (word.strip(punctuation) for word in all_text.split())
        context['terms'] = {word for word in stripped_words if len(word) > 2}
    except (TypeError, AttributeError):
        # Fail gracefully on non-string inputs - leave the term set empty
        pass

    return context
def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).

    Parser-agnostic function. The score is the sum of independent heuristics
    (attributes, URL structure, link text, overlap with page terms, docs-site
    bonus), clamped to the 0.0 - 10.0 range.

    Args:
        link_text: Text content of the link
        url: Link URL
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()

    Returns:
        Quality score (0.0 - 10.0), higher is better. If any heuristic raises
        (for example because url is None), the fallback score 0.5 is returned.
    """
    punctuation = '.,!?;:"()[]{}'
    score = 0.0

    try:
        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
        if title_attr and len(title_attr.strip()) > 3:
            score += 1.0

        class_str = (class_attr or '').lower()
        # NOTE(review): these are substring checks on the whole class string,
        # so e.g. 'ad' also matches 'header' - confirm this is intended.
        # Navigation/important classes boost score
        if any(nav_class in class_str for nav_class in ['nav', 'menu', 'primary', 'main', 'important']):
            score += 1.5
        # Marketing/ad classes reduce score
        if any(bad_class in class_str for bad_class in ['ad', 'sponsor', 'track', 'promo', 'banner']):
            score -= 1.0

        rel_str = (rel_attr or '').lower()
        # Semantic rel values
        if any(good_rel in rel_str for good_rel in ['canonical', 'next', 'prev', 'chapter']):
            score += 1.0
        if any(bad_rel in rel_str for bad_rel in ['nofollow', 'sponsored', 'ugc']):
            score -= 0.5

        # 2. URL STRUCTURE QUALITY (string operations - very fast)
        url_lower = url.lower()
        # High-value path patterns
        if any(good_path in url_lower for good_path in ['/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/']):
            score += 2.0
        elif any(medium_path in url_lower for medium_path in ['/blog/', '/article/', '/post/', '/news/']):
            score += 1.0
        # Penalize certain patterns
        if any(bad_path in url_lower for bad_path in ['/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/']):
            score -= 1.5

        # URL depth (shallow URLs often more important)
        url_depth = url.count('/') - 2  # Subtract protocol and domain
        if url_depth <= 2:
            score += 1.0
        elif url_depth > 5:
            score -= 0.5

        # HTTPS bonus
        if url.startswith('https://'):
            score += 0.5

        # 3. TEXT QUALITY (string analysis - very fast)
        if link_text:
            text_clean = link_text.strip()
            if len(text_clean) > 3:
                score += 1.0
            # Multi-word links are usually more descriptive
            word_count = len(text_clean.split())
            if word_count >= 2:
                score += 0.5
            if word_count >= 4:
                score += 0.5
            # Avoid generic link text
            generic_texts = ['click here', 'read more', 'more info', 'link', 'here']
            if text_clean.lower() in generic_texts:
                score -= 1.0

        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
        if page_context.get('terms') and link_text:
            stripped_words = (word.strip(punctuation) for word in link_text.split())
            link_words = {word.lower() for word in stripped_words if len(word) > 2}
            if link_words:
                # Calculate word overlap ratio
                overlap = len(link_words & page_context['terms'])
                if overlap > 0:
                    # The denominator is capped at 10 so long link texts are
                    # not over-penalized; the ratio itself is clamped to 1.0,
                    # otherwise more than 10 overlapping words would exceed
                    # the 2-point ceiling for this heuristic.
                    relevance_ratio = min(overlap / min(len(link_words), 10), 1.0)
                    score += relevance_ratio * 2.0  # Up to 2 points for relevance

        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
        if page_context.get('is_docs_site', False):
            # Documentation sites: prioritize internal navigation
            if link_text and any(doc_keyword in link_text.lower()
                                 for doc_keyword in ['api', 'reference', 'guide', 'tutorial', 'example']):
                score += 1.0

    except Exception:
        # Deliberate best-effort: scoring must never break link extraction,
        # so any failure yields a minimal score instead of propagating.
        score = 0.5

    # Ensure score is within reasonable bounds
    return max(0.0, min(score, 10.0))
def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic (no query, or no contextual score): return the
          intrinsic score clamped to 0-10 (a missing intrinsic score counts as 0.0)
        - Only contextual (query and contextual score, no intrinsic score):
          return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    """
    # Case 1: No scoring enabled at all
    if not score_links_enabled:
        return 5.0  # Neutral score - all links treated equally

    # Case 2: Only intrinsic scoring (no query provided or no head extraction)
    if not query_provided or contextual_score is None:
        intrinsic = intrinsic_score if intrinsic_score is not None else 0.0
        # Use intrinsic score directly (already 0-10 scale)
        return max(0.0, min(intrinsic, 10.0))

    # Scale contextual score (typically 0-1) to 0-10 range
    contextual_scaled = min(contextual_score * 10.0, 10.0)

    # Case 3: Only contextual scoring. Without this branch the contextual
    # score would be weighted at 30% against a phantom intrinsic score of 0,
    # capping the result at 3.0 instead of using the full 0-10 scale.
    if intrinsic_score is None:
        return max(0.0, contextual_scaled)

    # Case 4: Both intrinsic and contextual scores available
    # Weighted combination: 70% intrinsic (structure/content quality) + 30% contextual (query relevance)
    # This gives more weight to link quality while still considering relevance
    total = (intrinsic_score * 0.7) + (contextual_scaled * 0.3)
    return max(0.0, min(total, 10.0))