feat: Add advanced link head extraction with three-layer scoring system (#1)
Squashed commit from the feature/link-extractor branch implementing comprehensive link analysis:
- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
This commit is contained in:
@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
|
||||
echo "✅ Setup complete!"
|
||||
''')
|
||||
|
||||
|
||||
# Link Quality Scoring Functions
|
||||
def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
    """
    Extract page context for link scoring - called ONCE per page for performance.
    Parser-agnostic function that takes pre-extracted data.

    Args:
        page_title: Title of the page (``None`` is treated as empty)
        headlines_text: Combined text from h1, h2, h3 elements (``None`` is treated as empty)
        meta_description: Meta description content (``None`` is treated as empty)
        base_url: Base URL of the page (``None`` or a malformed URL yields an empty domain)

    Returns:
        Dictionary containing page context data for fast link scoring:
            'terms': set of lowercased words longer than 2 characters taken
                from the title, headlines and meta description
            'headlines': the headlines text ('' when not provided)
            'meta_description': the meta description ('' when not provided)
            'domain': lowercased network location of ``base_url``
            'is_docs_site': True when the domain contains one of
                'docs.', 'api.', 'developer.', 'reference.'
    """
    from urllib.parse import urlparse

    strip_chars = '.,!?;:"()[]{}'
    headlines = headlines_text or ''
    description = meta_description or ''

    context = {
        'terms': set(),
        'headlines': headlines,
        'meta_description': description,
        'domain': '',
        'is_docs_site': False,
    }

    # URL parsing is isolated from term extraction so that a missing or
    # malformed base URL (urlparse raises ValueError for e.g. an unbalanced
    # IPv6 bracket) does not also throw away the page terms.
    try:
        domain = urlparse(base_url or '').netloc.lower()
    except (ValueError, AttributeError, TypeError):
        domain = ''
    if not isinstance(domain, str):
        # urlparse returns bytes components for bytes input; the substring
        # checks below need text.
        domain = ''

    context['domain'] = domain
    # Check if this is a documentation/reference site
    context['is_docs_site'] = any(
        indicator in domain
        for indicator in ('docs.', 'api.', 'developer.', 'reference.')
    )

    # Create term set for fast intersection (performance optimization)
    try:
        all_text = ' '.join((page_title or '', headlines, description)).lower()
        # Simple tokenization - fast and sufficient for scoring
        stripped_words = (word.strip(strip_chars) for word in all_text.split())
        context['terms'] = {word for word in stripped_words if len(word) > 2}
    except (AttributeError, TypeError):
        # Best effort: non-string inputs leave the term set empty
        context['terms'] = set()

    return context
|
||||
|
||||
|
||||
def calculate_link_intrinsic_score(
    link_text: str,
    url: str,
    title_attr: str,
    class_attr: str,
    rel_attr: str,
    page_context: dict
) -> float:
    """
    Ultra-fast link quality scoring using only provided data (no DOM access needed).
    Parser-agnostic function.

    Args:
        link_text: Text content of the link
        url: Link URL (``None`` is treated as an empty URL)
        title_attr: Title attribute of the link
        class_attr: Class attribute of the link
        rel_attr: Rel attribute of the link
        page_context: Pre-computed page context from extract_page_context()
            (``None`` is treated as an empty context)

    Returns:
        Quality score (0.0 - 10.0), higher is better. If the inputs have
        unexpected types the function falls back to a minimal score of 0.5.
    """
    strip_chars = '.,!?;:"()[]{}'
    score = 0.0

    try:
        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
        if title_attr and len(title_attr.strip()) > 3:
            score += 1.0

        class_str = (class_attr or '').lower()
        # Navigation/important classes boost score
        if any(nav_class in class_str for nav_class in ('nav', 'menu', 'primary', 'main', 'important')):
            score += 1.5

        # Marketing/ad classes reduce score. 'ad' is matched as a whole
        # class token (split on whitespace, '-' and '_') because a plain
        # substring test also hits 'header', 'breadcrumb', 'download', etc.
        class_tokens = class_str.replace('-', ' ').replace('_', ' ').split()
        has_ad_token = any(
            token in ('ad', 'ads') or token.startswith('advert')
            for token in class_tokens
        )
        if has_ad_token or any(bad_class in class_str for bad_class in ('sponsor', 'track', 'promo', 'banner')):
            score -= 1.0

        rel_str = (rel_attr or '').lower()
        # Semantic rel values
        if any(good_rel in rel_str for good_rel in ('canonical', 'next', 'prev', 'chapter')):
            score += 1.0
        if any(bad_rel in rel_str for bad_rel in ('nofollow', 'sponsored', 'ugc')):
            score -= 0.5

        # 2. URL STRUCTURE QUALITY (string operations - very fast)
        url_str = url or ''
        url_lower = url_str.lower()

        # High-value path patterns
        if any(good_path in url_lower for good_path in ('/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/')):
            score += 2.0
        elif any(medium_path in url_lower for medium_path in ('/blog/', '/article/', '/post/', '/news/')):
            score += 1.0

        # Penalize certain patterns
        if any(bad_path in url_lower for bad_path in ('/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/')):
            score -= 1.5

        # URL depth (shallow URLs often more important). Only the part
        # before the query string / fragment is counted so that slashes in
        # parameters (e.g. ?next=/a/b/c) do not inflate the depth.
        location = url_str.partition('#')[0].partition('?')[0]
        url_depth = location.count('/') - 2  # Subtract protocol and domain
        if url_depth <= 2:
            score += 1.0
        elif url_depth > 5:
            score -= 0.5

        # HTTPS bonus (scheme comparison is case-insensitive)
        if url_lower.startswith('https://'):
            score += 0.5

        # 3. TEXT QUALITY (string analysis - very fast)
        if link_text:
            text_clean = link_text.strip()
            if len(text_clean) > 3:
                score += 1.0

            # Multi-word links are usually more descriptive
            word_count = len(text_clean.split())
            if word_count >= 2:
                score += 0.5
            if word_count >= 4:
                score += 0.5

            # Avoid generic link text
            if text_clean.lower() in ('click here', 'read more', 'more info', 'link', 'here'):
                score -= 1.0

        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
        context = page_context or {}
        page_terms = context.get('terms')
        if page_terms and link_text:
            stripped_words = (word.strip(strip_chars) for word in link_text.split())
            link_words = {word.lower() for word in stripped_words if len(word) > 2}

            if link_words:
                # Calculate word overlap ratio
                overlap = len(link_words.intersection(page_terms))
                if overlap > 0:
                    # The denominator is capped at 10 words so long link
                    # texts are not under-weighted; the ratio itself is
                    # clamped to 1.0 so relevance never adds more than 2 points.
                    relevance_ratio = min(overlap / min(len(link_words), 10), 1.0)
                    score += relevance_ratio * 2.0  # Up to 2 points for relevance

        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
        if context.get('is_docs_site', False):
            # Documentation sites: prioritize internal navigation
            if link_text and any(doc_keyword in link_text.lower()
                                 for doc_keyword in ('api', 'reference', 'guide', 'tutorial', 'example')):
                score += 1.0

    except (AttributeError, TypeError):
        # Fail gracefully on unexpected input types - return minimal score
        score = 0.5

    # Ensure score is within reasonable bounds
    return max(0.0, min(score, 10.0))
|
||||
|
||||
|
||||
def calculate_total_score(
    intrinsic_score: Optional[float] = None,
    contextual_score: Optional[float] = None,
    score_links_enabled: bool = False,
    query_provided: bool = False
) -> float:
    """
    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.

    Args:
        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
        score_links_enabled: Whether link scoring is enabled
        query_provided: Whether a query was provided for contextual scoring

    Returns:
        Combined total score (0-10 scale)

    Scoring Logic:
        - No scoring: return 5.0 (neutral score)
        - Only intrinsic: return intrinsic score clamped to 0-10
          (0.0 when no intrinsic score is available either)
        - Only contextual: return contextual score scaled to 10
        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
    """
    # Case 1: No scoring enabled at all
    if not score_links_enabled:
        return 5.0  # Neutral score - all links treated equally

    # Case 2: No usable contextual score (no query provided or no head extraction)
    if not query_provided or contextual_score is None:
        # Use intrinsic score directly (already 0-10 scale); missing means 0.0
        intrinsic = intrinsic_score if intrinsic_score is not None else 0.0
        return max(0.0, min(intrinsic, 10.0))

    # Scale contextual score (typically 0-1) to the 0-10 range, clamping both
    # ends so an out-of-range relevance value cannot leave the scale.
    contextual_scaled = max(0.0, min(contextual_score * 10.0, 10.0))

    # Case 3: Only contextual scoring - there is no intrinsic score to blend
    # with, so the scaled relevance is the total (rather than 30% of it).
    if intrinsic_score is None:
        return contextual_scaled

    # Case 4: Both available.
    # Weighted combination: 70% intrinsic (structure/content quality) + 30% contextual (query relevance)
    # This gives more weight to link quality while still considering relevance
    total = (intrinsic_score * 0.7) + (contextual_scaled * 0.3)

    return max(0.0, min(total, 10.0))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user