feat: Add advanced link head extraction with three-layer scoring system (#1)

Squashed commit from the feature/link-extractor branch, implementing comprehensive link analysis:
- Extract HTML head content from discovered links with parallel processing
- Three-layer scoring: Intrinsic (URL quality), Contextual (BM25), and Total scores
- New LinkExtractionConfig class for type-safe configuration
- Pattern-based filtering for internal/external links
- Comprehensive documentation and examples
2025-06-27 20:06:04 +08:00
parent e528086341
commit 5c9c305dbf
10 changed files with 2126 additions and 15 deletions
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2939,3 +2939,212 @@ pip install -q nest_asyncio google-colab
 echo "✅ Setup complete!"
 ''')

+
+# Link Quality Scoring Functions
+def extract_page_context(page_title: str, headlines_text: str, meta_description: str, base_url: str) -> dict:
+    """
+    Extract page context for link scoring - called ONCE per page for performance.
+    Parser-agnostic function that takes pre-extracted data.
+
+    URL parsing and term extraction are handled independently, so a missing
+    or malformed ``base_url`` does not discard the terms derived from the
+    page text (and vice versa). The function never raises for bad input.
+
+    Args:
+        page_title: Title of the page
+        headlines_text: Combined text from h1, h2, h3 elements
+        meta_description: Meta description content
+        base_url: Base URL of the page
+
+    Returns:
+        Dictionary containing page context data for fast link scoring:
+            'terms': set of lower-cased words (longer than 2 characters after
+                stripping surrounding punctuation) from title, headlines and
+                meta description
+            'headlines': headlines_text, or '' when falsy
+            'meta_description': meta_description, or '' when falsy
+            'domain': lower-cased netloc of base_url, or '' when unavailable
+            'is_docs_site': True when the domain contains one of
+                'docs.', 'api.', 'developer.', 'reference.'
+    """
+    from urllib.parse import urlparse
+
+    punctuation = '.,!?;:"()[]{}'
+    docs_indicators = ('docs.', 'api.', 'developer.', 'reference.')
+
+    context = {
+        'terms': set(),
+        'headlines': headlines_text or '',
+        'meta_description': meta_description or '',
+        'domain': '',
+        'is_docs_site': False
+    }
+
+    # Domain detection. Only str URLs are parsed: urlparse() returns bytes
+    # components for bytes/None input, which would leak a bytes 'domain'.
+    if base_url and isinstance(base_url, str):
+        try:
+            context['domain'] = urlparse(base_url).netloc.lower()
+        except ValueError:
+            # urlparse rejects some malformed hosts (e.g. unmatched brackets);
+            # keep the empty domain and carry on with term extraction.
+            pass
+
+    # Check if this is a documentation/reference site (substring match on the domain)
+    context['is_docs_site'] = any(indicator in context['domain']
+                                  for indicator in docs_indicators)
+
+    # Create term set for fast intersection (performance optimization)
+    try:
+        all_text = ' '.join(
+            (page_title or '', context['headlines'], context['meta_description'])
+        ).lower()
+        # Simple tokenization - fast and sufficient for scoring
+        stripped_words = (word.strip(punctuation) for word in all_text.split())
+        context['terms'] = {word for word in stripped_words if len(word) > 2}
+    except (TypeError, AttributeError):
+        # Non-string text inputs: fail gracefully with an empty term set
+        pass
+
+    return context
+
+
+def calculate_link_intrinsic_score(
+    link_text: str, 
+    url: str, 
+    title_attr: str, 
+    class_attr: str, 
+    rel_attr: str, 
+    page_context: dict
+) -> float:
+    """
+    Ultra-fast link quality scoring using only provided data (no DOM access needed).
+    Parser-agnostic function.
+
+    The score is the sum of five heuristic groups (attribute quality, URL
+    structure, link text quality, overlap with page terms, docs-site bonus),
+    clamped to the 0.0 - 10.0 range. If any unexpected error occurs while
+    scoring (for example ``url`` is None), a minimal score of 0.5 is returned.
+
+    Args:
+        link_text: Text content of the link
+        url: Link URL
+        title_attr: Title attribute of the link
+        class_attr: Class attribute of the link
+        rel_attr: Rel attribute of the link
+        page_context: Pre-computed page context from extract_page_context();
+            None is treated as an empty context
+
+    Returns:
+        Quality score (0.0 - 10.0), higher is better
+    """
+    punctuation = '.,!?;:"()[]{}'
+    score = 0.0
+    # A missing context only disables the context-dependent bonuses
+    page_context = page_context or {}
+
+    try:
+        # 1. ATTRIBUTE QUALITY (string analysis - very fast)
+        if title_attr and len(title_attr.strip()) > 3:
+            score += 1.0
+
+        # NOTE: class/rel markers are matched as substrings, so short markers
+        # such as 'ad' also match inside longer class names (e.g. 'header').
+        class_str = (class_attr or '').lower()
+        # Navigation/important classes boost score
+        if any(nav_class in class_str for nav_class in ['nav', 'menu', 'primary', 'main', 'important']):
+            score += 1.5
+        # Marketing/ad classes reduce score
+        if any(bad_class in class_str for bad_class in ['ad', 'sponsor', 'track', 'promo', 'banner']):
+            score -= 1.0
+
+        rel_str = (rel_attr or '').lower()
+        # Semantic rel values
+        if any(good_rel in rel_str for good_rel in ['canonical', 'next', 'prev', 'chapter']):
+            score += 1.0
+        if any(bad_rel in rel_str for bad_rel in ['nofollow', 'sponsored', 'ugc']):
+            score -= 0.5
+
+        # 2. URL STRUCTURE QUALITY (string operations - very fast)
+        url_lower = url.lower()
+
+        # High-value path patterns
+        if any(good_path in url_lower for good_path in ['/docs/', '/api/', '/guide/', '/tutorial/', '/reference/', '/manual/']):
+            score += 2.0
+        elif any(medium_path in url_lower for medium_path in ['/blog/', '/article/', '/post/', '/news/']):
+            score += 1.0
+
+        # Penalize certain patterns
+        if any(bad_path in url_lower for bad_path in ['/admin/', '/login/', '/cart/', '/checkout/', '/track/', '/click/']):
+            score -= 1.5
+
+        # URL depth (shallow URLs often more important)
+        url_depth = url.count('/') - 2  # Subtract the two slashes of "scheme://"
+        if url_depth <= 2:
+            score += 1.0
+        elif url_depth > 5:
+            score -= 0.5
+
+        # HTTPS bonus (URL schemes are case-insensitive, so test the lowered URL)
+        if url_lower.startswith('https://'):
+            score += 0.5
+
+        # 3. TEXT QUALITY (string analysis - very fast)
+        if link_text:
+            text_clean = link_text.strip()
+            if len(text_clean) > 3:
+                score += 1.0
+
+            # Multi-word links are usually more descriptive
+            word_count = len(text_clean.split())
+            if word_count >= 2:
+                score += 0.5
+            if word_count >= 4:
+                score += 0.5
+
+            # Avoid generic link text
+            generic_texts = ['click here', 'read more', 'more info', 'link', 'here']
+            if text_clean.lower() in generic_texts:
+                score -= 1.0
+
+        # 4. CONTEXTUAL RELEVANCE (pre-computed page terms - very fast)
+        page_terms = page_context.get('terms')
+        if page_terms and link_text:
+            stripped_words = (word.strip(punctuation) for word in link_text.split())
+            link_words = {word.lower() for word in stripped_words if len(word) > 2}
+
+            if link_words:
+                # Calculate word overlap ratio
+                overlap = len(link_words & page_terms)
+                if overlap > 0:
+                    # The denominator is capped at 10 words so long link texts
+                    # are not under-weighted; the ratio itself is capped at 1.0
+                    # so more than 10 matching words cannot exceed the bonus.
+                    relevance_ratio = min(overlap / min(len(link_words), 10), 1.0)
+                    score += relevance_ratio * 2.0  # Up to 2 points for relevance
+
+        # 5. DOMAIN CONTEXT BONUSES (very fast string checks)
+        if page_context.get('is_docs_site', False):
+            # Documentation sites: prioritize internal navigation
+            if link_text and any(doc_keyword in link_text.lower()
+                                 for doc_keyword in ['api', 'reference', 'guide', 'tutorial', 'example']):
+                score += 1.0
+
+    except Exception:
+        # Deliberate best-effort: scoring must never break link extraction,
+        # so any unexpected input yields a minimal score instead of raising.
+        score = 0.5
+
+    # Ensure score is within reasonable bounds
+    return max(0.0, min(score, 10.0))
+
+
+def calculate_total_score(
+    intrinsic_score: Optional[float] = None,
+    contextual_score: Optional[float] = None,
+    score_links_enabled: bool = False,
+    query_provided: bool = False
+) -> float:
+    """
+    Calculate combined total score from intrinsic and contextual scores with smart fallbacks.
+
+    Args:
+        intrinsic_score: Quality score based on URL structure, text, and context (0-10)
+        contextual_score: BM25 relevance score based on query and head content (0-1 typically)
+        score_links_enabled: Whether link scoring is enabled
+        query_provided: Whether a query was provided for contextual scoring
+
+    Returns:
+        Combined total score (0-10 scale)
+
+    Scoring Logic:
+        - No scoring: return 5.0 (neutral score)
+        - Only intrinsic (no query, or no contextual score): return the
+          intrinsic score clamped to 0-10 (0.0 when it is missing too)
+        - Only contextual (query and contextual score, no intrinsic score):
+          return contextual score scaled to 10
+        - Both: weighted combination (70% intrinsic, 30% contextual scaled)
+    """
+    # Case 1: No scoring enabled at all
+    if not score_links_enabled:
+        return 5.0  # Neutral score - all links treated equally
+
+    # Case 2: Contextual score unusable (no query provided or no head extraction)
+    contextual_available = query_provided and contextual_score is not None
+    if not contextual_available:
+        # Use intrinsic score directly (already 0-10 scale); missing counts as 0.0
+        intrinsic = intrinsic_score if intrinsic_score is not None else 0.0
+        return max(0.0, min(intrinsic, 10.0))
+
+    # Scale contextual score (typically 0-1) to 0-10 range
+    contextual_scaled = min(contextual_score * 10.0, 10.0)
+
+    # Case 3: Only contextual scoring. Blending with a missing intrinsic score
+    # would silently cap the result at 30% of the scale, so use it on its own.
+    if intrinsic_score is None:
+        return max(0.0, contextual_scaled)
+
+    # Case 4: Both intrinsic and contextual scores available
+    # Weighted combination: 70% intrinsic (structure/content quality) + 30% contextual (query relevance)
+    # This gives more weight to link quality while still considering relevance
+    total = (intrinsic_score * 0.7) + (contextual_scaled * 0.3)
+
+    return max(0.0, min(total, 10.0))
+