Files
crawl4ai/crawl4ai/deep_crawling/scorers.py
Aravind dad592c801 2025 feb alpha 1 (#685)
* spelling change in prompt

* gpt-4o-mini support

* Remove leading Y before here

* prompt spell correction

* (Docs) Fix numbered list end-of-line formatting

Added the missing "two spaces" to add a line break

* fix: access downloads_path through browser_config in _handle_download method - Fixes #585

* crawl

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/583

* Docs update: https://github.com/unclecode/crawl4ai/issues/649

* fix: https://github.com/unclecode/crawl4ai/issues/570

* Docs: updated example for content-selection to reflect new changes in yc newsfeed css

* Refactor: Removed old filters and replaced with optimised filters

* fix:Fixed imports as per the new names of filters

* Tests: For deep crawl filters

* Refactor: Remove old scorers and replace with optimised ones: Fix imports forall filters and scorers.

* fix: awaiting on filters that are async in nature eg: content relevance and seo filters

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/715

---------

Co-authored-by: DarshanTank <darshan.tank@gnani.ai>
Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com>
Co-authored-by: Serhat Soydan <ssoydan@gmail.com>
Co-authored-by: cardit1 <maneesh@cardit.in>
Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
2025-02-19 14:13:17 +08:00

519 lines
16 KiB
Python

from abc import ABC, abstractmethod
from typing import List, Dict, Optional
from dataclasses import dataclass
from urllib.parse import urlparse, unquote
import re
import logging
from functools import lru_cache
from array import array
import ctypes
import platform
PLATFORM = platform.system()
# Pre-computed scores for common year differences
_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25]
# Pre-computed scores for common year differences
_FRESHNESS_SCORES = [
1.0, # Current year
0.9, # Last year
0.8, # 2 years ago
0.7, # 3 years ago
0.6, # 4 years ago
0.5, # 5 years ago
]
class ScoringStats:
__slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')
def __init__(self):
self._urls_scored = 0
self._total_score = 0.0
self._min_score = None # Lazy initialization
self._max_score = None
def update(self, score: float) -> None:
"""Optimized update with minimal operations"""
self._urls_scored += 1
self._total_score += score
# Lazy min/max tracking - only if actually accessed
if self._min_score is not None:
if score < self._min_score:
self._min_score = score
if self._max_score is not None:
if score > self._max_score:
self._max_score = score
def get_average(self) -> float:
"""Direct calculation instead of property"""
return self._total_score / self._urls_scored if self._urls_scored else 0.0
def get_min(self) -> float:
"""Lazy min calculation"""
if self._min_score is None:
self._min_score = self._total_score / self._urls_scored if self._urls_scored else 0.0
return self._min_score
def get_max(self) -> float:
"""Lazy max calculation"""
if self._max_score is None:
self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0
return self._max_score
class URLScorer(ABC):
__slots__ = ('_weight', '_stats')
def __init__(self, weight: float = 1.0):
# Store weight directly as float32 for memory efficiency
self._weight = ctypes.c_float(weight).value
self._stats = ScoringStats()
@abstractmethod
def _calculate_score(self, url: str) -> float:
"""Calculate raw score for URL."""
pass
def score(self, url: str) -> float:
"""Calculate weighted score with minimal overhead."""
score = self._calculate_score(url) * self._weight
self._stats.update(score)
return score
@property
def stats(self):
"""Access to scoring statistics."""
return self._stats
@property
def weight(self):
return self._weight
class CompositeScorer(URLScorer):
__slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
"""Initialize composite scorer combining multiple scoring strategies.
Optimized for:
- Fast parallel scoring
- Memory efficient score aggregation
- Quick short-circuit conditions
- Pre-allocated arrays
Args:
scorers: List of scoring strategies to combine
normalize: Whether to normalize final score by scorer count
"""
super().__init__(weight=1.0)
self._scorers = scorers
self._normalize = normalize
# Pre-allocate arrays for scores and weights
self._weights_array = array('f', [s.weight for s in scorers])
self._score_array = array('f', [0.0] * len(scorers))
@lru_cache(maxsize=10000)
def _calculate_score(self, url: str) -> float:
"""Calculate combined score from all scoring strategies.
Uses:
1. Pre-allocated arrays for scores
2. Short-circuit on zero scores
3. Optimized normalization
4. Vectorized operations where possible
Args:
url: URL to score
Returns:
Combined and optionally normalized score
"""
total_score = 0.0
scores = self._score_array
# Get scores from all scorers
for i, scorer in enumerate(self._scorers):
# Use public score() method which applies weight
scores[i] = scorer.score(url)
total_score += scores[i]
# Normalize if requested
if self._normalize and self._scorers:
count = len(self._scorers)
return total_score / count
return total_score
def score(self, url: str) -> float:
"""Public scoring interface with stats tracking.
Args:
url: URL to score
Returns:
Final combined score
"""
score = self._calculate_score(url)
self.stats.update(score)
return score
class KeywordRelevanceScorer(URLScorer):
__slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive')
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
super().__init__(weight=weight)
self._case_sensitive = case_sensitive
# Pre-process keywords once
self._keywords = [k if case_sensitive else k.lower() for k in keywords]
@lru_cache(maxsize=10000)
def _url_bytes(self, url: str) -> bytes:
"""Cache decoded URL bytes"""
return url.encode('utf-8') if self._case_sensitive else url.lower().encode('utf-8')
def _calculate_score(self, url: str) -> float:
"""Fast string matching without regex or byte conversion"""
if not self._case_sensitive:
url = url.lower()
matches = sum(1 for k in self._keywords if k in url)
# Fast return paths
if not matches:
return 0.0
if matches == len(self._keywords):
return 1.0
return matches / len(self._keywords)
class PathDepthScorer(URLScorer):
__slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
super().__init__(weight=weight)
self._optimal_depth = optimal_depth
@staticmethod
@lru_cache(maxsize=10000)
def _quick_depth(path: str) -> int:
"""Ultra fast path depth calculation.
Examples:
- "http://example.com" -> 0 # No path segments
- "http://example.com/" -> 0 # Empty path
- "http://example.com/a" -> 1
- "http://example.com/a/b" -> 2
"""
if not path or path == '/':
return 0
if '/' not in path:
return 0
depth = 0
last_was_slash = True
for c in path:
if c == '/':
if not last_was_slash:
depth += 1
last_was_slash = True
else:
last_was_slash = False
if not last_was_slash:
depth += 1
return depth
@lru_cache(maxsize=10000) # Cache the whole calculation
def _calculate_score(self, url: str) -> float:
pos = url.find('/', url.find('://') + 3)
if pos == -1:
depth = 0
else:
depth = self._quick_depth(url[pos:])
# Use lookup table for common distances
distance = depth - self._optimal_depth
distance = distance if distance >= 0 else -distance # Faster than abs()
if distance < 4:
return _SCORE_LOOKUP[distance]
return 1.0 / (1.0 + distance)
class ContentTypeScorer(URLScorer):
__slots__ = ('_weight', '_exact_types', '_regex_types')
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
"""Initialize scorer with type weights map.
Args:
type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0})
weight: Overall weight multiplier for this scorer
"""
super().__init__(weight=weight)
self._exact_types = {} # Fast lookup for simple extensions
self._regex_types = [] # Fallback for complex patterns
# Split into exact vs regex matchers for performance
for pattern, score in type_weights.items():
if pattern.startswith('.') and pattern.endswith('$'):
ext = pattern[1:-1]
self._exact_types[ext] = score
else:
self._regex_types.append((re.compile(pattern), score))
# Sort complex patterns by score for early exit
self._regex_types.sort(key=lambda x: -x[1])
@staticmethod
@lru_cache(maxsize=10000)
def _quick_extension(url: str) -> str:
"""Extract file extension ultra-fast without regex/splits.
Handles:
- Basic extensions: "example.html" -> "html"
- Query strings: "page.php?id=1" -> "php"
- Fragments: "doc.pdf#page=1" -> "pdf"
- Path params: "file.jpg;width=100" -> "jpg"
Args:
url: URL to extract extension from
Returns:
Extension without dot, or empty string if none found
"""
pos = url.rfind('.')
if pos == -1:
return ''
# Find first non-alphanumeric char after extension
end = len(url)
for i in range(pos + 1, len(url)):
c = url[i]
# Stop at query string, fragment, path param or any non-alphanumeric
if c in '?#;' or not c.isalnum():
end = i
break
return url[pos + 1:end].lower()
@lru_cache(maxsize=10000)
def _calculate_score(self, url: str) -> float:
"""Calculate content type score for URL.
Uses staged approach:
1. Try exact extension match (fast path)
2. Fall back to regex patterns if needed
Args:
url: URL to score
Returns:
Score between 0.0 and 1.0 * weight
"""
# Fast path: direct extension lookup
ext = self._quick_extension(url)
if ext:
score = self._exact_types.get(ext, None)
if score is not None:
return score
# Slow path: regex patterns
for pattern, score in self._regex_types:
if pattern.search(url):
return score
return 0.0
class FreshnessScorer(URLScorer):
__slots__ = ('_weight', '_date_pattern', '_current_year')
def __init__(self, weight: float = 1.0, current_year: int = 2024):
"""Initialize freshness scorer.
Extracts and scores dates from URLs using format:
- YYYY/MM/DD
- YYYY-MM-DD
- YYYY_MM_DD
- YYYY (year only)
Args:
weight: Score multiplier
current_year: Year to calculate freshness against (default 2024)
"""
super().__init__(weight=weight)
self._current_year = current_year
# Combined pattern for all date formats
# Uses non-capturing groups (?:) and alternation
self._date_pattern = re.compile(
r'(?:/' # Path separator
r'|[-_])' # or date separators
r'((?:19|20)\d{2})' # Year group (1900-2099)
r'(?:' # Optional month/day group
r'(?:/|[-_])' # Date separator
r'(?:\d{2})' # Month
r'(?:' # Optional day
r'(?:/|[-_])' # Date separator
r'(?:\d{2})' # Day
r')?' # Day is optional
r')?' # Month/day group is optional
)
@lru_cache(maxsize=10000)
def _extract_year(self, url: str) -> Optional[int]:
"""Extract the most recent year from URL.
Args:
url: URL to extract year from
Returns:
Year as int or None if no valid year found
"""
matches = self._date_pattern.finditer(url)
latest_year = None
# Find most recent year
for match in matches:
year = int(match.group(1))
if (year <= self._current_year and # Sanity check
(latest_year is None or year > latest_year)):
latest_year = year
return latest_year
@lru_cache(maxsize=10000)
def _calculate_score(self, url: str) -> float:
"""Calculate freshness score based on URL date.
More recent years score higher. Uses pre-computed scoring
table for common year differences.
Args:
url: URL to score
Returns:
Score between 0.0 and 1.0 * weight
"""
year = self._extract_year(url)
if year is None:
return 0.5 # Default score
# Use lookup table for common year differences
year_diff = self._current_year - year
if year_diff < len(_FRESHNESS_SCORES):
return _FRESHNESS_SCORES[year_diff]
# Fallback calculation for older content
return max(0.1, 1.0 - year_diff * 0.1)
class DomainAuthorityScorer(URLScorer):
__slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains')
def __init__(
self,
domain_weights: Dict[str, float],
default_weight: float = 0.5,
weight: float = 1.0,
):
"""Initialize domain authority scorer.
Args:
domain_weights: Dict mapping domains to authority scores
default_weight: Score for unknown domains
weight: Overall scorer weight multiplier
Example:
{
'python.org': 1.0,
'github.com': 0.9,
'medium.com': 0.7
}
"""
super().__init__(weight=weight)
# Pre-process domains for faster lookup
self._domain_weights = {
domain.lower(): score
for domain, score in domain_weights.items()
}
self._default_weight = default_weight
# Cache top domains for fast path
self._top_domains = {
domain: score
for domain, score in sorted(
domain_weights.items(),
key=lambda x: -x[1]
)[:5] # Keep top 5 highest scoring domains
}
@staticmethod
@lru_cache(maxsize=10000)
def _extract_domain(url: str) -> str:
"""Extract domain from URL ultra-fast.
Handles:
- Basic domains: "example.com"
- Subdomains: "sub.example.com"
- Ports: "example.com:8080"
- IPv4: "192.168.1.1"
Args:
url: Full URL to extract domain from
Returns:
Lowercase domain without port
"""
# Find domain start
start = url.find('://')
if start == -1:
start = 0
else:
start += 3
# Find domain end
end = url.find('/', start)
if end == -1:
end = url.find('?', start)
if end == -1:
end = url.find('#', start)
if end == -1:
end = len(url)
# Extract domain and remove port
domain = url[start:end]
port_idx = domain.rfind(':')
if port_idx != -1:
domain = domain[:port_idx]
return domain.lower()
@lru_cache(maxsize=10000)
def _calculate_score(self, url: str) -> float:
"""Calculate domain authority score.
Uses staged approach:
1. Check top domains (fastest)
2. Check full domain weights
3. Return default weight
Args:
url: URL to score
Returns:
Authority score between 0.0 and 1.0 * weight
"""
domain = self._extract_domain(url)
# Fast path: check top domains first
score = self._top_domains.get(domain)
if score is not None:
return score
# Regular path: check all domains
return self._domain_weights.get(domain, self._default_weight)