From 513d008de5464e7b09d3423e64ad32ef7e379707 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 27 Jan 2025 11:54:10 +0530 Subject: [PATCH] feat: Merge reviews from unclecode for scorers and filters & Remove the robots.txt compliance from scraper since that will be now handled by crawler --- crawl4ai/scraper/bfs_scraper_strategy.py | 40 +- crawl4ai/scraper/filters.py | 787 ++++++++++++-- crawl4ai/scraper/filters_review.py | 872 ---------------- crawl4ai/scraper/scorers.py | 953 ++++++++++++++++- crawl4ai/scraper/scorers_review.py | 1208 ---------------------- 5 files changed, 1631 insertions(+), 2229 deletions(-) delete mode 100644 crawl4ai/scraper/filters_review.py delete mode 100644 crawl4ai/scraper/scorers_review.py diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index c9db4c1c..646ff1d1 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -4,7 +4,7 @@ from datetime import datetime import asyncio import logging from urllib.parse import urlparse -from urllib.robotparser import RobotFileParser + # import validators from ..async_configs import CrawlerRunConfig @@ -26,7 +26,6 @@ class CrawlStats: urls_skipped: int = 0 total_depth_reached: int = 0 current_depth: int = 0 - robots_blocked: int = 0 class BFSScraperStrategy(ScraperStrategy): @@ -49,13 +48,11 @@ class BFSScraperStrategy(ScraperStrategy): self.stats = CrawlStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() self.process_external_links = process_external_links - self.robot_parsers: Dict[str, RobotFileParser] = {} async def can_process_url(self, url: str, depth: int) -> bool: """Check if URL can be processed based on robots.txt and filters This is our gatekeeper method that determines if a URL should be processed. It: - Validates URL format using a robust built-in method - - Checks robots.txt permissions for the domain - Applies custom filters from the filter chain - Updates statistics for blocked URLs - Returns False early if any check fails @@ -64,47 +61,20 @@ class BFSScraperStrategy(ScraperStrategy): result = urlparse(url) if not all([result.scheme, result.netloc]): raise ValueError("Invalid URL") - if result.scheme not in ('http', 'https'): + if result.scheme not in ("http", "https"): raise ValueError("URL must be HTTP or HTTPS") - if not result.netloc or '.' not in result.netloc: + if not result.netloc or "." not in result.netloc: raise ValueError("Invalid domain") except Exception as e: self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}") return False - robot_parser = await self._get_robot_parser(url) - if robot_parser and not robot_parser.can_fetch("*", url): - self.stats.robots_blocked += 1 - self.logger.info(f"Blocked by robots.txt: {url}") - return False - # Apply the filter chain if it's not start page if depth != 0 and not self.filter_chain.apply(url): return False return True - async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]: - """Get or create robots.txt parser for domain. - This is our robots.txt manager that: - - Uses domain-level caching of robot parsers - - Creates and caches new parsers as needed - - Handles failed robots.txt fetches gracefully - - Returns None if robots.txt can't be fetched, allowing crawling to proceed - """ - domain = urlparse(url).netloc - if domain not in self.robot_parsers: - parser = RobotFileParser() - try: - robots_url = f"{urlparse(url).scheme}://{domain}/robots.txt" - parser.set_url(robots_url) - parser.read() - self.robot_parsers[domain] = parser - except Exception as e: - self.logger.warning(f"Error fetching robots.txt for {domain}: {e}") - return None - return self.robot_parsers[domain] - async def _process_links( self, result: CrawlResult, @@ -117,7 +87,7 @@ class BFSScraperStrategy(ScraperStrategy): """Process extracted links from crawl result. This is our link processor that: Handles both internal and external links - Checks if URL can be processed - validates URL, applies Filters and tests Robots.txt compliance with can_process_url + Checks if URL can be processed - validates URL, applies Filters with can_process_url Checks depth limits Scores URLs for priority Updates depth tracking @@ -233,5 +203,3 @@ class BFSScraperStrategy(ScraperStrategy): async def shutdown(self): """Clean up resources and stop crawling""" self._cancel_event.set() - # Clear caches and close connections - self.robot_parsers.clear() diff --git a/crawl4ai/scraper/filters.py b/crawl4ai/scraper/filters.py index b0547ab9..1e123d11 100644 --- a/crawl4ai/scraper/filters.py +++ b/crawl4ai/scraper/filters.py @@ -1,29 +1,30 @@ -# from .url_filter import URLFilter, FilterChain -# from .content_type_filter import ContentTypeFilter -# from .url_pattern_filter import URLPatternFilter - from abc import ABC, abstractmethod -from typing import List, Pattern, Set, Union -import re +from typing import List, Pattern, Set, Union, FrozenSet +import re, time from urllib.parse import urlparse -import mimetypes +from array import array import logging -from dataclasses import dataclass +from functools import lru_cache import fnmatch +from dataclasses import dataclass +from typing import ClassVar +import weakref +import mimetypes @dataclass class FilterStats: - """Statistics for filter applications""" - + # PERF: Using dataclass creates overhead with __init__ and property access + # PERF: Could use __slots__ to reduce memory footprint + # PERF: Consider using array.array('I') for atomic increments total_urls: int = 0 rejected_urls: int = 0 passed_urls: int = 0 class URLFilter(ABC): - """Base class for URL filters""" - + # PERF: Logger creation is expensive, consider lazy initialization + # PERF: stats object creation adds overhead for each filter instance def __init__(self, name: str = None): self.name = name or self.__class__.__name__ self.stats = FilterStats() @@ -31,37 +32,29 @@ class URLFilter(ABC): @abstractmethod def apply(self, url: str) -> bool: - """Apply the filter to a URL""" pass def _update_stats(self, passed: bool): - """Update filter statistics""" - # INFO: Old trick to make things faster + # PERF: Already optimized but could use bitwise operations + # PERF: Consider removing stats entirely in production/fast mode self.stats.total_urls += 1 self.stats.passed_urls += passed - self.stats.rejected_urls += not passed - # self.stats.total_urls += 1 - # if passed: - # self.stats.passed_urls += 1 - # else: - # self.stats.rejected_urls += 1 + self.stats.rejected_urls += not passed class FilterChain: - """Chain of URL filters.""" - + # PERF: List traversal for each URL is expensive + # PERF: Could use array.array instead of list for filters + # PERF: Consider adding fast path for single filter case def __init__(self, filters: List[URLFilter] = None): self.filters = filters or [] self.stats = FilterStats() self.logger = logging.getLogger("urlfilter.chain") - def add_filter(self, filter_: URLFilter) -> "FilterChain": - """Add a filter to the chain""" - self.filters.append(filter_) - return self # Enable method chaining - def apply(self, url: str) -> bool: - """Apply all filters in the chain""" + # PERF: Logging on every rejection is expensive + # PERF: Could reorder filters by rejection rate + # PERF: Consider batch processing mode self.stats.total_urls += 1 for filter_ in self.filters: @@ -75,19 +68,9 @@ class FilterChain: class URLPatternFilter(URLFilter): - """Filter URLs based on glob patterns or regex. - - pattern_filter = URLPatternFilter([ - "*.example.com/*", # Glob pattern - "*/article/*", # Path pattern - re.compile(r"blog-\d+") # Regex pattern - ]) - - - Supports glob patterns and regex - - Multiple patterns per filter - - Pattern pre-compilation for performance - """ - + # PERF: Converting glob to regex is expensive + # PERF: Multiple regex compilation is slow + # PERF: List of patterns causes multiple regex evaluations def __init__( self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], @@ -98,6 +81,8 @@ class URLPatternFilter(URLFilter): self.use_glob = use_glob self._compiled_patterns = [] + # PERF: This could be consolidated into a single regex with OR conditions + # PERF: glob_to_regex creates complex patterns, could be simplified for pattern in self.patterns: if isinstance(pattern, str) and use_glob: self._compiled_patterns.append(self._glob_to_regex(pattern)) @@ -107,29 +92,22 @@ class URLPatternFilter(URLFilter): ) def _glob_to_regex(self, pattern: str) -> Pattern: - """Convert glob pattern to regex""" + # PERF: fnmatch.translate creates overly complex patterns + # PERF: Could cache common translations return re.compile(fnmatch.translate(pattern)) def apply(self, url: str) -> bool: - """Check if URL matches any of the patterns""" + # PERF: any() with generator is slower than direct loop with early return + # PERF: searching entire string is slower than anchored match matches = any(pattern.search(url) for pattern in self._compiled_patterns) self._update_stats(matches) return matches class ContentTypeFilter(URLFilter): - """Filter URLs based on expected content type. - - content_filter = ContentTypeFilter([ - "text/html", - "application/pdf" - ], check_extension=True) - - - Filter by MIME types - - Extension checking - - Support for multiple content types - """ - + # PERF: mimetypes guessing is extremely slow + # PERF: URL parsing on every check is expensive + # PERF: No caching of results for similar extensions def __init__( self, allowed_types: Union[str, List[str]], check_extension: bool = True ): @@ -145,15 +123,18 @@ class ContentTypeFilter(URLFilter): self.allowed_types = [t.lower() for t in self.allowed_types] def _check_extension(self, url: str) -> bool: - """Check URL's file extension""" + # PERF: urlparse is called on every check + # PERF: multiple string splits are expensive + # PERF: mimetypes.guess_type is very slow ext = ( urlparse(url).path.split(".")[-1].lower() if "." in urlparse(url).path else "" ) if not ext: - return True # No extension, might be dynamic content + return True + # PERF: guess_type is main bottleneck guessed_type = mimetypes.guess_type(url)[0] return any( allowed in (guessed_type or "").lower() for allowed in self.allowed_types @@ -169,24 +150,16 @@ class ContentTypeFilter(URLFilter): class DomainFilter(URLFilter): - """Filter URLs based on allowed/blocked domains. - - domain_filter = DomainFilter( - allowed_domains=["example.com", "blog.example.com"], - blocked_domains=["ads.example.com"] - ) - - - Allow/block specific domains - - Subdomain support - - Efficient domain matching - """ - + # PERF: Set lookups are fast but string normalizations on init are not + # PERF: Creating two sets doubles memory usage def __init__( self, allowed_domains: Union[str, List[str]] = None, blocked_domains: Union[str, List[str]] = None, ): super().__init__() + # PERF: Normalizing domains on every init is wasteful + # PERF: Could use frozenset for immutable lists self.allowed_domains = ( set(self._normalize_domains(allowed_domains)) if allowed_domains else None ) @@ -195,17 +168,21 @@ class DomainFilter(URLFilter): ) def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]: - """Normalize domain strings""" + # PERF: strip() and lower() create new strings for each domain + # PERF: List comprehension creates intermediate list if isinstance(domains, str): domains = [domains] return [d.lower().strip() for d in domains] def _extract_domain(self, url: str) -> str: - """Extract domain from URL""" + # PERF: urlparse is called for every URL check + # PERF: lower() creates new string every time + # PERF: Could cache recent results return urlparse(url).netloc.lower() def apply(self, url: str) -> bool: - """Check if URL's domain is allowed""" + # PERF: Two separate set lookups in worst case + # PERF: Domain extraction happens before knowing if we have any filters domain = self._extract_domain(url) if domain in self.blocked_domains: @@ -239,47 +216,653 @@ def create_common_filter_chain() -> FilterChain: ) +#################################################################################### +# Uncledoe: Optimized Version +#################################################################################### + + +# Use __slots__ and array for maximum memory/speed efficiency +class FastFilterStats: + __slots__ = ("_counters",) + + def __init__(self): + # Use array of unsigned ints for atomic operations + self._counters = array("I", [0, 0, 0]) # total, passed, rejected + + @property + def total_urls(self): + return self._counters[0] + + @property + def passed_urls(self): + return self._counters[1] + + @property + def rejected_urls(self): + return self._counters[2] + + +class FastURLFilter(ABC): + """Optimized base filter class""" + + __slots__ = ("name", "stats", "_logger_ref") + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FastFilterStats() + # Lazy logger initialization using weakref + self._logger_ref = None + + @property + def logger(self): + if self._logger_ref is None or self._logger_ref() is None: + logger = logging.getLogger(f"urlfilter.{self.name}") + self._logger_ref = weakref.ref(logger) + return self._logger_ref() + + @abstractmethod + def apply(self, url: str) -> bool: + pass + + def _update_stats(self, passed: bool): + # Use direct array index for speed + self.stats._counters[0] += 1 # total + self.stats._counters[1] += passed # passed + self.stats._counters[2] += not passed # rejected + + +class FastFilterChain: + """Optimized filter chain""" + + __slots__ = ("filters", "stats", "_logger_ref") + + def __init__(self, filters: List[FastURLFilter] = None): + self.filters = tuple(filters or []) # Immutable tuple for speed + self.stats = FastFilterStats() + self._logger_ref = None + + @property + def logger(self): + if self._logger_ref is None or self._logger_ref() is None: + logger = logging.getLogger("urlfilter.chain") + self._logger_ref = weakref.ref(logger) + return self._logger_ref() + + def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain": + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining + + def apply(self, url: str) -> bool: + """Optimized apply with minimal operations""" + self.stats._counters[0] += 1 # total + + # Direct tuple iteration is faster than list + for f in self.filters: + if not f.apply(url): + self.stats._counters[2] += 1 # rejected + return False + + self.stats._counters[1] += 1 # passed + return True + +class FastURLPatternFilter(FastURLFilter): + """Pattern filter balancing speed and completeness""" + __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns') + + PATTERN_TYPES = { + 'SUFFIX': 1, # *.html + 'PREFIX': 2, # /foo/* + 'DOMAIN': 3, # *.example.com + 'PATH': 4 , # Everything else + 'REGEX': 5 + } + + def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True): + super().__init__() + patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + + self._simple_suffixes = set() + self._simple_prefixes = set() + self._domain_patterns = [] + self._path_patterns = [] + + for pattern in patterns: + pattern_type = self._categorize_pattern(pattern) + self._add_pattern(pattern, pattern_type) + + def _categorize_pattern(self, pattern: str) -> int: + """Categorize pattern for specialized handling""" + if not isinstance(pattern, str): + return self.PATTERN_TYPES['PATH'] + + # Check if it's a regex pattern + if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern: + return self.PATTERN_TYPES['REGEX'] + + if pattern.count('*') == 1: + if pattern.startswith('*.'): + return self.PATTERN_TYPES['SUFFIX'] + if pattern.endswith('/*'): + return self.PATTERN_TYPES['PREFIX'] + + if '://' in pattern and pattern.startswith('*.'): + return self.PATTERN_TYPES['DOMAIN'] + + return self.PATTERN_TYPES['PATH'] + + def _add_pattern(self, pattern: str, pattern_type: int): + """Add pattern to appropriate matcher""" + if pattern_type == self.PATTERN_TYPES['REGEX']: + # For regex patterns, compile directly without glob translation + if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern): + self._path_patterns.append(re.compile(pattern)) + return + elif pattern_type == self.PATTERN_TYPES['SUFFIX']: + self._simple_suffixes.add(pattern[2:]) + elif pattern_type == self.PATTERN_TYPES['PREFIX']: + self._simple_prefixes.add(pattern[:-2]) + elif pattern_type == self.PATTERN_TYPES['DOMAIN']: + self._domain_patterns.append( + re.compile(pattern.replace('*.', r'[^/]+\.')) + ) + else: + if isinstance(pattern, str): + # Handle complex glob patterns + if '**' in pattern: + pattern = pattern.replace('**', '.*') + if '{' in pattern: + # Convert {a,b} to (a|b) + pattern = re.sub(r'\{([^}]+)\}', + lambda m: f'({"|".join(m.group(1).split(","))})', + pattern) + pattern = fnmatch.translate(pattern) + self._path_patterns.append( + pattern if isinstance(pattern, Pattern) else re.compile(pattern) + ) + + @lru_cache(maxsize=10000) + def apply(self, url: str) -> bool: + """Hierarchical pattern matching""" + # Quick suffix check (*.html) + if self._simple_suffixes: + path = url.split('?')[0] + if path.split('/')[-1].split('.')[-1] in self._simple_suffixes: + self._update_stats(True) + return True + + # Domain check + if self._domain_patterns: + for pattern in self._domain_patterns: + if pattern.match(url): + self._update_stats(True) + return True + + # Prefix check (/foo/*) + if self._simple_prefixes: + path = url.split('?')[0] + if any(path.startswith(p) for p in self._simple_prefixes): + self._update_stats(True) + return True + + # Complex patterns + if self._path_patterns: + if any(p.search(url) for p in self._path_patterns): + self._update_stats(True) + return True + + self._update_stats(False) + return False + + +class FastContentTypeFilter(FastURLFilter): + """Optimized content type filter using fast lookups""" + + __slots__ = ("allowed_types", "_ext_map", "_check_extension") + + # Fast extension to mime type mapping + _MIME_MAP = { + # Text Formats + "txt": "text/plain", + "html": "text/html", + "htm": "text/html", + "xhtml": "application/xhtml+xml", + "css": "text/css", + "csv": "text/csv", + "ics": "text/calendar", + "js": "application/javascript", + # Images + "bmp": "image/bmp", + "gif": "image/gif", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "png": "image/png", + "svg": "image/svg+xml", + "tiff": "image/tiff", + "ico": "image/x-icon", + "webp": "image/webp", + # Audio + "mp3": "audio/mpeg", + "wav": "audio/wav", + "ogg": "audio/ogg", + "m4a": "audio/mp4", + "aac": "audio/aac", + # Video + "mp4": "video/mp4", + "mpeg": "video/mpeg", + "webm": "video/webm", + "avi": "video/x-msvideo", + "mov": "video/quicktime", + "flv": "video/x-flv", + "wmv": "video/x-ms-wmv", + "mkv": "video/x-matroska", + # Applications + "json": "application/json", + "xml": "application/xml", + "pdf": "application/pdf", + "zip": "application/zip", + "gz": "application/gzip", + "tar": "application/x-tar", + "rar": "application/vnd.rar", + "7z": "application/x-7z-compressed", + "exe": "application/vnd.microsoft.portable-executable", + "msi": "application/x-msdownload", + # Fonts + "woff": "font/woff", + "woff2": "font/woff2", + "ttf": "font/ttf", + "otf": "font/otf", + # Microsoft Office + "doc": "application/msword", + "dot": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "xls": "application/vnd.ms-excel", + "ppt": "application/vnd.ms-powerpoint", + "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", + # OpenDocument Formats + "odt": "application/vnd.oasis.opendocument.text", + "ods": "application/vnd.oasis.opendocument.spreadsheet", + "odp": "application/vnd.oasis.opendocument.presentation", + # Archives + "tar.gz": "application/gzip", + "tgz": "application/gzip", + "bz2": "application/x-bzip2", + # Others + "rtf": "application/rtf", + "apk": "application/vnd.android.package-archive", + "epub": "application/epub+zip", + "jar": "application/java-archive", + "swf": "application/x-shockwave-flash", + "midi": "audio/midi", + "mid": "audio/midi", + "ps": "application/postscript", + "ai": "application/postscript", + "eps": "application/postscript", + # Custom or less common + "bin": "application/octet-stream", + "dmg": "application/x-apple-diskimage", + "iso": "application/x-iso9660-image", + "deb": "application/x-debian-package", + "rpm": "application/x-rpm", + "sqlite": "application/vnd.sqlite3", + # Placeholder + "unknown": "application/octet-stream", # Fallback for unknown file types + } + + @staticmethod + @lru_cache(maxsize=1000) + def _extract_extension(path: str) -> str: + """Fast extension extraction with caching""" + if "." not in path: + return "" + return path.rpartition(".")[-1].lower() + + def __init__( + self, allowed_types: Union[str, List[str]], check_extension: bool = True + ): + super().__init__() + # Normalize and store as frozenset for fast lookup + self.allowed_types = frozenset( + t.lower() + for t in ( + allowed_types if isinstance(allowed_types, list) else [allowed_types] + ) + ) + self._check_extension = check_extension + + # Pre-compute extension map for allowed types + self._ext_map = frozenset( + ext + for ext, mime in self._MIME_MAP.items() + if any(allowed in mime for allowed in self.allowed_types) + ) + + @lru_cache(maxsize=1000) + def _check_url_cached(self, url: str) -> bool: + """Cached URL checking""" + if not self._check_extension: + return True + + path = url.split("?")[0] # Fast path split + ext = self._extract_extension(path) + if not ext: + return True + + return ext in self._ext_map + + def apply(self, url: str) -> bool: + """Fast extension check with caching""" + result = self._check_url_cached(url) + self._update_stats(result) + return result + + +class FastDomainFilter(FastURLFilter): + """Optimized domain filter with fast lookups and caching""" + + __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache") + + # Regex for fast domain extraction + _DOMAIN_REGEX = re.compile(r"://([^/]+)") + + def __init__( + self, + allowed_domains: Union[str, List[str]] = None, + blocked_domains: Union[str, List[str]] = None, + ): + super().__init__() + + # Convert inputs to frozensets for immutable, fast lookups + self._allowed_domains = ( + frozenset(self._normalize_domains(allowed_domains)) + if allowed_domains + else None + ) + self._blocked_domains = ( + frozenset(self._normalize_domains(blocked_domains)) + if blocked_domains + else frozenset() + ) + + @staticmethod + def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]: + """Fast domain normalization""" + if isinstance(domains, str): + return {domains.lower()} + return {d.lower() for d in domains} + + @staticmethod + @lru_cache(maxsize=10000) + def _extract_domain(url: str) -> str: + """Ultra-fast domain extraction with regex and caching""" + match = FastDomainFilter._DOMAIN_REGEX.search(url) + return match.group(1).lower() if match else "" + + def apply(self, url: str) -> bool: + """Optimized domain checking with early returns""" + # Skip processing if no filters + if not self._blocked_domains and self._allowed_domains is None: + self._update_stats(True) + return True + + domain = self._extract_domain(url) + + # Early return for blocked domains + if domain in self._blocked_domains: + self._update_stats(False) + return False + + # If no allowed domains specified, accept all non-blocked + if self._allowed_domains is None: + self._update_stats(True) + return True + + # Final allowed domains check + result = domain in self._allowed_domains + self._update_stats(result) + return result + + +def create_fast_filter_chain() -> FastFilterChain: + """Create an optimized filter chain with filters ordered by rejection rate""" + return FastFilterChain( + [ + # Domain filter first (fastest rejection) + FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]), + # Content filter second (medium speed) + FastContentTypeFilter(["text/html", "application/xhtml+xml"]), + # Pattern filter last (most expensive) + FastURLPatternFilter( + [ + "*.html", + "*.htm", + "*/article/*", + "*/blog/*", + ] + ), + ] + ) + def run_performance_test(): import time import random - - # Test URLs - test_urls = [ - 'https://example.com/article/123', - 'https://blog.example.com/post/456', - 'https://ads.example.com/tracking', - 'https://example.com/about.html', - 'https://analytics.example.com/script.js', - 'https://example.com/products.php', - 'https://subdomain.example.com/blog/post-123', - 'https://example.com/path/file.pdf', - ] * 100000 # Create 800k URLs to test - - def benchmark(name: str, func, *args): + from itertools import cycle + + # Generate test URLs + base_urls = [ + "https://example.com/article/123", + "https://blog.example.com/post/456", + "https://ads.example.com/tracking", + "https://example.com/about.html", + "https://analytics.example.com/script.js", + "https://example.com/products.php", + "https://subdomain.example.com/blog/post-123", + "https://example.com/path/file.pdf", + ] + + # Create more varied test data + test_urls = [] + for base in base_urls: + # Add original + test_urls.append(base) + # Add variations + parts = base.split("/") + for i in range(10): + parts[-1] = f"page_{i}.html" + test_urls.append("/".join(parts)) + + # Multiply to get enough test data + test_urls = test_urls * 10000 # Creates ~800k URLs + + def benchmark(name: str, func, *args, warmup=True): + if warmup: + # Warmup run + func(*args) + + # Actual timing start = time.perf_counter_ns() - result = func(*args) if args[0] else func() + result = func(*args) elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms - print(f"{name:<30} {elapsed:>8.3f} ms") + print( + f"{name:<30} {elapsed:>8.3f} ms ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)" + ) return result - # Test individual filters + print("\nBenchmarking original vs optimized implementations...") + print("-" * 70) + + # Original implementation pattern_filter = URLPatternFilter(["*.html", "*/article/*"]) content_filter = ContentTypeFilter(["text/html"]) domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"]) - - # Test chain chain = FilterChain([pattern_filter, content_filter, domain_filter]) + + # Optimized implementation + fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"]) + fast_content_filter = FastContentTypeFilter(["text/html"]) + fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]) + fast_chain = FastFilterChain( + [fast_domain_filter, fast_content_filter, fast_pattern_filter] + ) + + # Test individual filters + print("\nSingle filter performance (first 1000 URLs):") + test_subset = test_urls[:1000] + + print("\nPattern Filters:") + benchmark( + "Original Pattern Filter", + lambda: [pattern_filter.apply(url) for url in test_subset], + ) + benchmark( + "Optimized Pattern Filter", + lambda: [fast_pattern_filter.apply(url) for url in test_subset], + ) + + print("\nContent Filters:") + benchmark( + "Original Content Filter", + lambda: [content_filter.apply(url) for url in test_subset], + ) + benchmark( + "Optimized Content Filter", + lambda: [fast_content_filter.apply(url) for url in test_subset], + ) + + print("\nDomain Filters:") + benchmark( + "Original Domain Filter", + lambda: [domain_filter.apply(url) for url in test_subset], + ) + benchmark( + "Optimized Domain Filter", + lambda: [fast_domain_filter.apply(url) for url in test_subset], + ) + + print("\nFull Chain Performance (all URLs):") + # Test chain + benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls]) + benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls]) + + # Memory usage + import sys + + print("\nMemory Usage per Filter:") + print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes") + print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes") + print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes") + print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes") + print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes") + print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes") + +def test_pattern_filter(): + import time + from itertools import chain + + # Test cases as list of tuples instead of dict for multiple patterns + test_cases = [ + # Simple suffix patterns (*.html) + ("*.html", { + "https://example.com/page.html": True, + "https://example.com/path/doc.html": True, + "https://example.com/page.htm": False, + "https://example.com/page.html?param=1": True, + }), + + # Path prefix patterns (/foo/*) + ("*/article/*", { + "https://example.com/article/123": True, + "https://example.com/blog/article/456": True, + "https://example.com/articles/789": False, + "https://example.com/article": False, + }), + + # Complex patterns + ("blog-*-[0-9]", { + "https://example.com/blog-post-1": True, + "https://example.com/blog-test-9": True, + "https://example.com/blog-post": False, + "https://example.com/blog-post-x": False, + }), + + # Multiple patterns case + (["*.pdf", "*/download/*"], { + "https://example.com/doc.pdf": True, + "https://example.com/download/file.txt": True, + "https://example.com/path/download/doc": True, + "https://example.com/uploads/file.txt": False, + }), + + # Edge cases + ("*", { + "https://example.com": True, + "": True, + "http://test.com/path": True, + }), + + # Complex regex + (r"^https?://.*\.example\.com/\d+", { + "https://sub.example.com/123": True, + "http://test.example.com/456": True, + "https://example.com/789": False, + "https://sub.example.com/abc": False, + }) + ] + + def run_accuracy_test(): + print("\nAccuracy Tests:") + print("-" * 50) + + all_passed = True + for patterns, test_urls in test_cases: + filter_obj = FastURLPatternFilter(patterns) + + for url, expected in test_urls.items(): + result = filter_obj.apply(url) + if result != expected: + print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'") + print(f" Expected: {expected}, Got: {result}") + all_passed = False + else: + print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'") + + return all_passed + + def run_speed_test(): + print("\nSpeed Tests:") + print("-" * 50) + + # Create a large set of test URLs + all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases)) + test_urls = all_urls * 10000 # 100K+ URLs + + # Test both implementations + original = URLPatternFilter(["*.html", "*/article/*", "blog-*"]) + optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"]) + + def benchmark(name, filter_obj): + start = time.perf_counter() + for url in test_urls: + filter_obj.apply(url) + elapsed = time.perf_counter() - start + urls_per_sec = len(test_urls) / elapsed + print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)") + + benchmark("Original Filter:", original) + benchmark("Optimized Filter:", optimized) + + # Run tests + print("Running Pattern Filter Tests...") + accuracy_passed = run_accuracy_test() - print("\nBenchmarking individual filters...") - for url in test_urls[:5]: # Show first 5 results - print(f"\nTesting URL: {url}") - benchmark("Pattern Filter", pattern_filter.apply, url) - benchmark("Content Filter", content_filter.apply, url) - benchmark("Domain Filter", domain_filter.apply, url) - - print("\nBenchmarking full chain...") - benchmark("Full Chain", lambda: [chain.apply(url) for url in test_urls], None) + if accuracy_passed: + print("\n✨ All accuracy tests passed!") + run_speed_test() + else: + print("\n❌ Some accuracy tests failed!") if __name__ == "__main__": - run_performance_test() \ No newline at end of file + run_performance_test() + # test_pattern_filter() diff --git a/crawl4ai/scraper/filters_review.py b/crawl4ai/scraper/filters_review.py deleted file mode 100644 index c67244c0..00000000 --- a/crawl4ai/scraper/filters_review.py +++ /dev/null @@ -1,872 +0,0 @@ -# from .url_filter import URLFilter, FilterChain -# from .content_type_filter import ContentTypeFilter -# from .url_pattern_filter import URLPatternFilter - -from abc import ABC, abstractmethod -from typing import List, Pattern, Set, Union, FrozenSet -import re, time -from urllib.parse import urlparse -from array import array -import logging -from functools import lru_cache -import fnmatch -from dataclasses import dataclass -from typing import ClassVar -import weakref -import mimetypes - - -@dataclass -class FilterStats: - # PERF: Using dataclass creates overhead with __init__ and property access - # PERF: Could use __slots__ to reduce memory footprint - # PERF: Consider using array.array('I') for atomic increments - total_urls: int = 0 - rejected_urls: int = 0 - passed_urls: int = 0 - - -class URLFilter(ABC): - # PERF: Logger creation is expensive, consider lazy initialization - # PERF: stats object creation adds overhead for each filter instance - def __init__(self, name: str = None): - self.name = name or self.__class__.__name__ - self.stats = FilterStats() - self.logger = logging.getLogger(f"urlfilter.{self.name}") - - @abstractmethod - def apply(self, url: str) -> bool: - pass - - def _update_stats(self, passed: bool): - # PERF: Already optimized but could use bitwise operations - # PERF: Consider removing stats entirely in production/fast mode - self.stats.total_urls += 1 - self.stats.passed_urls += passed - self.stats.rejected_urls += not passed - - -class FilterChain: - # PERF: List traversal for each URL is expensive - # PERF: Could use array.array instead of list for filters - # PERF: Consider adding fast path for single filter case - def __init__(self, filters: List[URLFilter] = None): - self.filters = filters or [] - self.stats = FilterStats() - self.logger = logging.getLogger("urlfilter.chain") - - def apply(self, url: str) -> bool: - # PERF: Logging on every rejection is expensive - # PERF: Could reorder filters by rejection rate - # PERF: Consider batch processing mode - self.stats.total_urls += 1 - - for filter_ in self.filters: - if not filter_.apply(url): - self.stats.rejected_urls += 1 - self.logger.debug(f"URL {url} rejected by {filter_.name}") - return False - - self.stats.passed_urls += 1 - return True - - -class URLPatternFilter(URLFilter): - # PERF: Converting glob to regex is expensive - # PERF: Multiple regex compilation is slow - # PERF: List of patterns causes multiple regex evaluations - def __init__( - self, - patterns: Union[str, Pattern, List[Union[str, Pattern]]], - use_glob: bool = True, - ): - super().__init__() - self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - self.use_glob = use_glob - self._compiled_patterns = [] - - # PERF: This could be consolidated into a single regex with OR conditions - # PERF: glob_to_regex creates complex patterns, could be simplified - for pattern in self.patterns: - if isinstance(pattern, str) and use_glob: - self._compiled_patterns.append(self._glob_to_regex(pattern)) - else: - self._compiled_patterns.append( - re.compile(pattern) if isinstance(pattern, str) else pattern - ) - - def _glob_to_regex(self, pattern: str) -> Pattern: - # PERF: fnmatch.translate creates overly complex patterns - # PERF: Could cache common translations - return re.compile(fnmatch.translate(pattern)) - - def apply(self, url: str) -> bool: - # PERF: any() with generator is slower than direct loop with early return - # PERF: searching entire string is slower than anchored match - matches = any(pattern.search(url) for pattern in self._compiled_patterns) - self._update_stats(matches) - return matches - - -class ContentTypeFilter(URLFilter): - # PERF: mimetypes guessing is extremely slow - # PERF: URL parsing on every check is expensive - # PERF: No caching of results for similar extensions - def __init__( - self, allowed_types: Union[str, List[str]], check_extension: bool = True - ): - super().__init__() - self.allowed_types = ( - [allowed_types] if isinstance(allowed_types, str) else allowed_types - ) - self.check_extension = check_extension - self._normalize_types() - - def _normalize_types(self): - """Normalize content type strings""" - self.allowed_types = [t.lower() for t in self.allowed_types] - - def _check_extension(self, url: str) -> bool: - # PERF: urlparse is called on every check - # PERF: multiple string splits are expensive - # PERF: mimetypes.guess_type is very slow - ext = ( - urlparse(url).path.split(".")[-1].lower() - if "." in urlparse(url).path - else "" - ) - if not ext: - return True - - # PERF: guess_type is main bottleneck - guessed_type = mimetypes.guess_type(url)[0] - return any( - allowed in (guessed_type or "").lower() for allowed in self.allowed_types - ) - - def apply(self, url: str) -> bool: - """Check if URL's content type is allowed""" - result = True - if self.check_extension: - result = self._check_extension(url) - self._update_stats(result) - return result - - -class DomainFilter(URLFilter): - # PERF: Set lookups are fast but string normalizations on init are not - # PERF: Creating two sets doubles memory usage - def __init__( - self, - allowed_domains: Union[str, List[str]] = None, - blocked_domains: Union[str, List[str]] = None, - ): - super().__init__() - # PERF: Normalizing domains on every init is wasteful - # PERF: Could use frozenset for immutable lists - self.allowed_domains = ( - set(self._normalize_domains(allowed_domains)) if allowed_domains else None - ) - self.blocked_domains = ( - set(self._normalize_domains(blocked_domains)) if blocked_domains else set() - ) - - def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]: - # PERF: strip() and lower() create new strings for each domain - # PERF: List comprehension creates intermediate list - if isinstance(domains, str): - domains = [domains] - return [d.lower().strip() for d in domains] - - def _extract_domain(self, url: str) -> str: - # PERF: urlparse is called for every URL check - # PERF: lower() creates new string every time - # PERF: Could cache recent results - return urlparse(url).netloc.lower() - - def apply(self, url: str) -> bool: - # PERF: Two separate set lookups in worst case - # PERF: Domain extraction happens before knowing if we have any filters - domain = self._extract_domain(url) - - if domain in self.blocked_domains: - self._update_stats(False) - return False - - if self.allowed_domains is not None and domain not in self.allowed_domains: - self._update_stats(False) - return False - - self._update_stats(True) - return True - - -# Example usage: -def create_common_filter_chain() -> FilterChain: - """Create a commonly used filter chain""" - return FilterChain( - [ - URLPatternFilter( - [ - "*.html", - "*.htm", # HTML files - "*/article/*", - "*/blog/*", # Common content paths - ] - ), - ContentTypeFilter(["text/html", "application/xhtml+xml"]), - DomainFilter(blocked_domains=["ads.*", "analytics.*"]), - ] - ) - - -#################################################################################### -# Uncledoe: Optimized Version -#################################################################################### - - -# Use __slots__ and array for maximum memory/speed efficiency -class FastFilterStats: - __slots__ = ("_counters",) - - def __init__(self): - # Use array of unsigned ints for atomic operations - self._counters = array("I", [0, 0, 0]) # total, passed, rejected - - @property - def total_urls(self): - return self._counters[0] - - @property - def passed_urls(self): - return self._counters[1] - - @property - def rejected_urls(self): - return self._counters[2] - - -class FastURLFilter(ABC): - """Optimized base filter class""" - - __slots__ = ("name", "stats", "_logger_ref") - - def __init__(self, name: str = None): - self.name = name or self.__class__.__name__ - self.stats = FastFilterStats() - # Lazy logger initialization using weakref - self._logger_ref = None - - @property - def logger(self): - if self._logger_ref is None or self._logger_ref() is None: - logger = logging.getLogger(f"urlfilter.{self.name}") - self._logger_ref = weakref.ref(logger) - return self._logger_ref() - - @abstractmethod - def apply(self, url: str) -> bool: - pass - - def _update_stats(self, passed: bool): - # Use direct array index for speed - self.stats._counters[0] += 1 # total - self.stats._counters[1] += passed # passed - self.stats._counters[2] += not passed # rejected - - -class FastFilterChain: - """Optimized filter chain""" - - __slots__ = ("filters", "stats", "_logger_ref") - - def __init__(self, filters: List[FastURLFilter] = None): - self.filters = tuple(filters or []) # Immutable tuple for speed - self.stats = FastFilterStats() - self._logger_ref = None - - @property - def logger(self): - if self._logger_ref is None or self._logger_ref() is None: - logger = logging.getLogger("urlfilter.chain") - self._logger_ref = weakref.ref(logger) - return self._logger_ref() - - def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain": - """Add a filter to the chain""" - self.filters.append(filter_) - return self # Enable method chaining - - def apply(self, url: str) -> bool: - """Optimized apply with minimal operations""" - self.stats._counters[0] += 1 # total - - # Direct tuple iteration is faster than list - for f in self.filters: - if not f.apply(url): - self.stats._counters[2] += 1 # rejected - return False - - self.stats._counters[1] += 1 # passed - return True - -class FastURLPatternFilter(FastURLFilter): - """Pattern filter balancing speed and completeness""" - __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns') - - PATTERN_TYPES = { - 'SUFFIX': 1, # *.html - 'PREFIX': 2, # /foo/* - 'DOMAIN': 3, # *.example.com - 'PATH': 4 , # Everything else - 'REGEX': 5 - } - - def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True): - super().__init__() - patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - - self._simple_suffixes = set() - self._simple_prefixes = set() - self._domain_patterns = [] - self._path_patterns = [] - - for pattern in patterns: - pattern_type = self._categorize_pattern(pattern) - self._add_pattern(pattern, pattern_type) - - def _categorize_pattern(self, pattern: str) -> int: - """Categorize pattern for specialized handling""" - if not isinstance(pattern, str): - return self.PATTERN_TYPES['PATH'] - - # Check if it's a regex pattern - if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern: - return self.PATTERN_TYPES['REGEX'] - - if pattern.count('*') == 1: - if pattern.startswith('*.'): - return self.PATTERN_TYPES['SUFFIX'] - if pattern.endswith('/*'): - return self.PATTERN_TYPES['PREFIX'] - - if '://' in pattern and pattern.startswith('*.'): - return self.PATTERN_TYPES['DOMAIN'] - - return self.PATTERN_TYPES['PATH'] - - def _add_pattern(self, pattern: str, pattern_type: int): - """Add pattern to appropriate matcher""" - if pattern_type == self.PATTERN_TYPES['REGEX']: - # For regex patterns, compile directly without glob translation - if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern): - self._path_patterns.append(re.compile(pattern)) - return - elif pattern_type == self.PATTERN_TYPES['SUFFIX']: - self._simple_suffixes.add(pattern[2:]) - elif pattern_type == self.PATTERN_TYPES['PREFIX']: - self._simple_prefixes.add(pattern[:-2]) - elif pattern_type == self.PATTERN_TYPES['DOMAIN']: - self._domain_patterns.append( - re.compile(pattern.replace('*.', r'[^/]+\.')) - ) - else: - if isinstance(pattern, str): - # Handle complex glob patterns - if '**' in pattern: - pattern = pattern.replace('**', '.*') - if '{' in pattern: - # Convert {a,b} to (a|b) - pattern = re.sub(r'\{([^}]+)\}', - lambda m: f'({"|".join(m.group(1).split(","))})', - pattern) - pattern = fnmatch.translate(pattern) - self._path_patterns.append( - pattern if isinstance(pattern, Pattern) else re.compile(pattern) - ) - - @lru_cache(maxsize=10000) - def apply(self, url: str) -> bool: - """Hierarchical pattern matching""" - # Quick suffix check (*.html) - if self._simple_suffixes: - path = url.split('?')[0] - if path.split('/')[-1].split('.')[-1] in self._simple_suffixes: - self._update_stats(True) - return True - - # Domain check - if self._domain_patterns: - for pattern in self._domain_patterns: - if pattern.match(url): - self._update_stats(True) - return True - - # Prefix check (/foo/*) - if self._simple_prefixes: - path = url.split('?')[0] - if any(path.startswith(p) for p in self._simple_prefixes): - self._update_stats(True) - return True - - # Complex patterns - if self._path_patterns: - if any(p.search(url) for p in self._path_patterns): - self._update_stats(True) - return True - - self._update_stats(False) - return False - - -class FastContentTypeFilter(FastURLFilter): - """Optimized content type filter using fast lookups""" - - __slots__ = ("allowed_types", "_ext_map", "_check_extension") - - # Fast extension to mime type mapping - _MIME_MAP = { - # Text Formats - "txt": "text/plain", - "html": "text/html", - "htm": "text/html", - "xhtml": "application/xhtml+xml", - "css": "text/css", - "csv": "text/csv", - "ics": "text/calendar", - "js": "application/javascript", - # Images - "bmp": "image/bmp", - "gif": "image/gif", - "jpeg": "image/jpeg", - "jpg": "image/jpeg", - "png": "image/png", - "svg": "image/svg+xml", - "tiff": "image/tiff", - "ico": "image/x-icon", - "webp": "image/webp", - # Audio - "mp3": "audio/mpeg", - "wav": "audio/wav", - "ogg": "audio/ogg", - "m4a": "audio/mp4", - "aac": "audio/aac", - # Video - "mp4": "video/mp4", - "mpeg": "video/mpeg", - "webm": "video/webm", - "avi": "video/x-msvideo", - "mov": "video/quicktime", - "flv": "video/x-flv", - "wmv": "video/x-ms-wmv", - "mkv": "video/x-matroska", - # Applications - "json": "application/json", - "xml": "application/xml", - "pdf": "application/pdf", - "zip": "application/zip", - "gz": "application/gzip", - "tar": "application/x-tar", - "rar": "application/vnd.rar", - "7z": "application/x-7z-compressed", - "exe": "application/vnd.microsoft.portable-executable", - "msi": "application/x-msdownload", - # Fonts - "woff": "font/woff", - "woff2": "font/woff2", - "ttf": "font/ttf", - "otf": "font/otf", - # Microsoft Office - "doc": "application/msword", - "dot": "application/msword", - "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "xls": "application/vnd.ms-excel", - "ppt": "application/vnd.ms-powerpoint", - "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - # OpenDocument Formats - "odt": "application/vnd.oasis.opendocument.text", - "ods": "application/vnd.oasis.opendocument.spreadsheet", - "odp": "application/vnd.oasis.opendocument.presentation", - # Archives - "tar.gz": "application/gzip", - "tgz": "application/gzip", - "bz2": "application/x-bzip2", - # Others - "rtf": "application/rtf", - "apk": "application/vnd.android.package-archive", - "epub": "application/epub+zip", - "jar": "application/java-archive", - "swf": "application/x-shockwave-flash", - "midi": "audio/midi", - "mid": "audio/midi", - "ps": "application/postscript", - "ai": "application/postscript", - "eps": "application/postscript", - # Custom or less common - "bin": "application/octet-stream", - "dmg": "application/x-apple-diskimage", - "iso": "application/x-iso9660-image", - "deb": "application/x-debian-package", - "rpm": "application/x-rpm", - "sqlite": "application/vnd.sqlite3", - # Placeholder - "unknown": "application/octet-stream", # Fallback for unknown file types - } - - @staticmethod - @lru_cache(maxsize=1000) - def _extract_extension(path: str) -> str: - """Fast extension extraction with caching""" - if "." not in path: - return "" - return path.rpartition(".")[-1].lower() - - def __init__( - self, allowed_types: Union[str, List[str]], check_extension: bool = True - ): - super().__init__() - # Normalize and store as frozenset for fast lookup - self.allowed_types = frozenset( - t.lower() - for t in ( - allowed_types if isinstance(allowed_types, list) else [allowed_types] - ) - ) - self._check_extension = check_extension - - # Pre-compute extension map for allowed types - self._ext_map = frozenset( - ext - for ext, mime in self._MIME_MAP.items() - if any(allowed in mime for allowed in self.allowed_types) - ) - - @lru_cache(maxsize=1000) - def _check_url_cached(self, url: str) -> bool: - """Cached URL checking""" - if not self._check_extension: - return True - - path = url.split("?")[0] # Fast path split - ext = self._extract_extension(path) - if not ext: - return True - - return ext in self._ext_map - - def apply(self, url: str) -> bool: - """Fast extension check with caching""" - result = self._check_url_cached(url) - self._update_stats(result) - return result - - -class FastDomainFilter(FastURLFilter): - """Optimized domain filter with fast lookups and caching""" - - __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache") - - # Regex for fast domain extraction - _DOMAIN_REGEX = re.compile(r"://([^/]+)") - - def __init__( - self, - allowed_domains: Union[str, List[str]] = None, - blocked_domains: Union[str, List[str]] = None, - ): - super().__init__() - - # Convert inputs to frozensets for immutable, fast lookups - self._allowed_domains = ( - frozenset(self._normalize_domains(allowed_domains)) - if allowed_domains - else None - ) - self._blocked_domains = ( - frozenset(self._normalize_domains(blocked_domains)) - if blocked_domains - else frozenset() - ) - - @staticmethod - def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]: - """Fast domain normalization""" - if isinstance(domains, str): - return {domains.lower()} - return {d.lower() for d in domains} - - @staticmethod - @lru_cache(maxsize=10000) - def _extract_domain(url: str) -> str: - """Ultra-fast domain extraction with regex and caching""" - match = FastDomainFilter._DOMAIN_REGEX.search(url) - return match.group(1).lower() if match else "" - - def apply(self, url: str) -> bool: - """Optimized domain checking with early returns""" - # Skip processing if no filters - if not self._blocked_domains and self._allowed_domains is None: - self._update_stats(True) - return True - - domain = self._extract_domain(url) - - # Early return for blocked domains - if domain in self._blocked_domains: - self._update_stats(False) - return False - - # If no allowed domains specified, accept all non-blocked - if self._allowed_domains is None: - self._update_stats(True) - return True - - # Final allowed domains check - result = domain in self._allowed_domains - self._update_stats(result) - return result - - -def create_fast_filter_chain() -> FastFilterChain: - """Create an optimized filter chain with filters ordered by rejection rate""" - return FastFilterChain( - [ - # Domain filter first (fastest rejection) - FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]), - # Content filter second (medium speed) - FastContentTypeFilter(["text/html", "application/xhtml+xml"]), - # Pattern filter last (most expensive) - FastURLPatternFilter( - [ - "*.html", - "*.htm", - "*/article/*", - "*/blog/*", - ] - ), - ] - ) - - -def run_performance_test(): - import time - import random - from itertools import cycle - - # Generate test URLs - base_urls = [ - "https://example.com/article/123", - "https://blog.example.com/post/456", - "https://ads.example.com/tracking", - "https://example.com/about.html", - "https://analytics.example.com/script.js", - "https://example.com/products.php", - "https://subdomain.example.com/blog/post-123", - "https://example.com/path/file.pdf", - ] - - # Create more varied test data - test_urls = [] - for base in base_urls: - # Add original - test_urls.append(base) - # Add variations - parts = base.split("/") - for i in range(10): - parts[-1] = f"page_{i}.html" - test_urls.append("/".join(parts)) - - # Multiply to get enough test data - test_urls = test_urls * 10000 # Creates ~800k URLs - - def benchmark(name: str, func, *args, warmup=True): - if warmup: - # Warmup run - func(*args) - - # Actual timing - start = time.perf_counter_ns() - result = func(*args) - elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms - print( - f"{name:<30} {elapsed:>8.3f} ms ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)" - ) - return result - - print("\nBenchmarking original vs optimized implementations...") - print("-" * 70) - - # Original implementation - pattern_filter = URLPatternFilter(["*.html", "*/article/*"]) - content_filter = ContentTypeFilter(["text/html"]) - domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"]) - chain = FilterChain([pattern_filter, content_filter, domain_filter]) - - # Optimized implementation - fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"]) - fast_content_filter = FastContentTypeFilter(["text/html"]) - fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]) - fast_chain = FastFilterChain( - [fast_domain_filter, fast_content_filter, fast_pattern_filter] - ) - - # Test individual filters - print("\nSingle filter performance (first 1000 URLs):") - test_subset = test_urls[:1000] - - print("\nPattern Filters:") - benchmark( - "Original Pattern Filter", - lambda: [pattern_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Pattern Filter", - lambda: [fast_pattern_filter.apply(url) for url in test_subset], - ) - - print("\nContent Filters:") - benchmark( - "Original Content Filter", - lambda: [content_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Content Filter", - lambda: [fast_content_filter.apply(url) for url in test_subset], - ) - - print("\nDomain Filters:") - benchmark( - "Original Domain Filter", - lambda: [domain_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Domain Filter", - lambda: [fast_domain_filter.apply(url) for url in test_subset], - ) - - print("\nFull Chain Performance (all URLs):") - # Test chain - benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls]) - benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls]) - - # Memory usage - import sys - - print("\nMemory Usage per Filter:") - print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes") - print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes") - print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes") - print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes") - print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes") - print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes") - -def test_pattern_filter(): - import time - from itertools import chain - - # Test cases as list of tuples instead of dict for multiple patterns - test_cases = [ - # Simple suffix patterns (*.html) - ("*.html", { - "https://example.com/page.html": True, - "https://example.com/path/doc.html": True, - "https://example.com/page.htm": False, - "https://example.com/page.html?param=1": True, - }), - - # Path prefix patterns (/foo/*) - ("*/article/*", { - "https://example.com/article/123": True, - "https://example.com/blog/article/456": True, - "https://example.com/articles/789": False, - "https://example.com/article": False, - }), - - # Complex patterns - ("blog-*-[0-9]", { - "https://example.com/blog-post-1": True, - "https://example.com/blog-test-9": True, - "https://example.com/blog-post": False, - "https://example.com/blog-post-x": False, - }), - - # Multiple patterns case - (["*.pdf", "*/download/*"], { - "https://example.com/doc.pdf": True, - "https://example.com/download/file.txt": True, - "https://example.com/path/download/doc": True, - "https://example.com/uploads/file.txt": False, - }), - - # Edge cases - ("*", { - "https://example.com": True, - "": True, - "http://test.com/path": True, - }), - - # Complex regex - (r"^https?://.*\.example\.com/\d+", { - "https://sub.example.com/123": True, - "http://test.example.com/456": True, - "https://example.com/789": False, - "https://sub.example.com/abc": False, - }) - ] - - def run_accuracy_test(): - print("\nAccuracy Tests:") - print("-" * 50) - - all_passed = True - for patterns, test_urls in test_cases: - filter_obj = FastURLPatternFilter(patterns) - - for url, expected in test_urls.items(): - result = filter_obj.apply(url) - if result != expected: - print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'") - print(f" Expected: {expected}, Got: {result}") - all_passed = False - else: - print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'") - - return all_passed - - def run_speed_test(): - print("\nSpeed Tests:") - print("-" * 50) - - # Create a large set of test URLs - all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases)) - test_urls = all_urls * 10000 # 100K+ URLs - - # Test both implementations - original = URLPatternFilter(["*.html", "*/article/*", "blog-*"]) - optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"]) - - def benchmark(name, filter_obj): - start = time.perf_counter() - for url in test_urls: - filter_obj.apply(url) - elapsed = time.perf_counter() - start - urls_per_sec = len(test_urls) / elapsed - print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)") - - benchmark("Original Filter:", original) - benchmark("Optimized Filter:", optimized) - - # Run tests - print("Running Pattern Filter Tests...") - accuracy_passed = run_accuracy_test() - - if accuracy_passed: - print("\n✨ All accuracy tests passed!") - run_speed_test() - else: - print("\n❌ Some accuracy tests failed!") - -if __name__ == "__main__": - run_performance_test() - # test_pattern_filter() diff --git a/crawl4ai/scraper/scorers.py b/crawl4ai/scraper/scorers.py index 8552aa8e..4ef90dcd 100644 --- a/crawl4ai/scraper/scorers.py +++ b/crawl4ai/scraper/scorers.py @@ -1,6 +1,3 @@ -# from .url_scorer import URLScorer -# from .keyword_relevance_scorer import KeywordRelevanceScorer - from abc import ABC, abstractmethod from typing import List, Dict, Optional, Union from dataclasses import dataclass @@ -9,17 +6,42 @@ import re from collections import defaultdict import math import logging +from functools import lru_cache +from array import array +from functools import lru_cache +import ctypes +import platform +PLATFORM = platform.system() + +# Pre-computed scores for common year differences +_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25] + +# Pre-computed scores for common year differences +_FRESHNESS_SCORES = [ + 1.0, # Current year + 0.9, # Last year + 0.8, # 2 years ago + 0.7, # 3 years ago + 0.6, # 4 years ago + 0.5, # 5 years ago +] + +# Pre-computed normalization factors for powers of 2 +_POW2_NORM = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625] @dataclass class ScoringStats: - """Statistics for URL scoring""" - + # PERF: Dataclass introduces overhead with property access and __init__ + # PERF: Float operations and comparisons are expensive for high-frequency updates + # PERF: Property calculation on every access is inefficient + # PERF: Storing min/max adds memory overhead and comparison costs + # PERF: Using inf/-inf creates unnecessary float objects urls_scored: int = 0 total_score: float = 0.0 - min_score: float = float("inf") + min_score: float = float("inf") # Expensive object creation max_score: float = float("-inf") - + def update(self, score: float): """Update scoring statistics""" self.urls_scored += 1 @@ -32,9 +54,50 @@ class ScoringStats: """Calculate average score""" return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0 +class FastScoringStats: + __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score') + + def __init__(self): + self._urls_scored = 0 + self._total_score = 0.0 + self._min_score = None # Lazy initialization + self._max_score = None + + def update(self, score: float) -> None: + """Optimized update with minimal operations""" + self._urls_scored += 1 + self._total_score += score + + # Lazy min/max tracking - only if actually accessed + if self._min_score is not None: + if score < self._min_score: + self._min_score = score + if self._max_score is not None: + if score > self._max_score: + self._max_score = score + + def get_average(self) -> float: + """Direct calculation instead of property""" + return self._total_score / self._urls_scored if self._urls_scored else 0.0 + + def get_min(self) -> float: + """Lazy min calculation""" + if self._min_score is None: + self._min_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 + return self._min_score + + def get_max(self) -> float: + """Lazy max calculation""" + if self._max_score is None: + self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 + return self._max_score class URLScorer(ABC): - """Base class for URL scoring strategies""" + # PERF: Property access overhead for weight + # PERF: Unnecessary name attribute + # PERF: Stats object creation overhead + # PERF: Logger creation for each instance + # PERF: Abstract method overhead def __init__(self, weight: float = 1.0, name: str = None): self.weight = weight @@ -44,18 +107,52 @@ class URLScorer(ABC): @abstractmethod def _calculate_score(self, url: str) -> float: - """Calculate the raw score for a URL""" pass def score(self, url: str) -> float: - """Calculate the weighted score for a URL""" raw_score = self._calculate_score(url) weighted_score = raw_score * self.weight self.stats.update(weighted_score) return weighted_score +# Optimized base class +class FastURLScorer(ABC): + __slots__ = ('_weight', '_stats') + + def __init__(self, weight: float = 1.0): + # Store weight directly as float32 for memory efficiency + self._weight = ctypes.c_float(weight).value + self._stats = ScoringStats() + + @abstractmethod + def _calculate_score(self, url: str) -> float: + """Calculate raw score for URL.""" + pass + + def score(self, url: str) -> float: + """Calculate weighted score with minimal overhead.""" + score = self._calculate_score(url) * self._weight + self._stats.update(score) + return score + + @property + def stats(self): + """Access to scoring statistics.""" + return self._stats + + @property + def weight(self): + return self._weight class CompositeScorer(URLScorer): + # PERF: Unnecessary list iteration for each score + # PERF: Creates new list for scores + # PERF: Division on every normalization + # PERF: No parallelization for independent scorers + # PERF: No short circuit for zero scores + # PERF: No weighting optimization + # PERF: No caching of combined scores + # PERF: List allocation for scores storag """Combines multiple scorers with weights""" def __init__(self, scorers: List[URLScorer], normalize: bool = True): @@ -72,8 +169,84 @@ class CompositeScorer(URLScorer): return total_score +class FastCompositeScorer(FastURLScorer): + __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array') + + def __init__(self, scorers: List[URLScorer], normalize: bool = True): + """Initialize composite scorer combining multiple scoring strategies. + + Optimized for: + - Fast parallel scoring + - Memory efficient score aggregation + - Quick short-circuit conditions + - Pre-allocated arrays + + Args: + scorers: List of scoring strategies to combine + normalize: Whether to normalize final score by scorer count + """ + super().__init__(weight=1.0) + self._scorers = scorers + self._normalize = normalize + + # Pre-allocate arrays for scores and weights + self._weights_array = array('f', [s.weight for s in scorers]) + self._score_array = array('f', [0.0] * len(scorers)) -class KeywordRelevanceScorer(URLScorer): + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate combined score from all scoring strategies. + + Uses: + 1. Pre-allocated arrays for scores + 2. Short-circuit on zero scores + 3. Optimized normalization + 4. Vectorized operations where possible + + Args: + url: URL to score + + Returns: + Combined and optionally normalized score + """ + total_score = 0.0 + scores = self._score_array + + # Get scores from all scorers + for i, scorer in enumerate(self._scorers): + # Use public score() method which applies weight + scores[i] = scorer.score(url) + total_score += scores[i] + + # Normalize if requested + if self._normalize and self._scorers: + count = len(self._scorers) + return total_score / count + + return total_score + + def score(self, url: str) -> float: + """Public scoring interface with stats tracking. + + Args: + url: URL to score + + Returns: + Final combined score + """ + score = self._calculate_score(url) + self.stats.update(score) + return score + +class KeywordRelevanceScorer(URLScorer): + # PERF: Regex compilation and pattern matching is expensive + # PERF: List comprehension with pattern search has high overhead + # PERF: URL decoding on every calculation + # PERF: Division operation for normalization is costly + # PERF: Case insensitive regex adds overhead + # PERF: No pattern caching or reuse + # PERF: Using inheritance adds method lookup overhead + """Score URLs based on keyword relevance. keyword_scorer = KeywordRelevanceScorer( @@ -109,8 +282,44 @@ class KeywordRelevanceScorer(URLScorer): # Normalize score between 0 and 1 return total_matches / len(self.patterns) if self.patterns else 0.0 +class FastKeywordRelevanceScorer(FastURLScorer): + __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive') + + def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False): + super().__init__(weight=weight) + self._case_sensitive = case_sensitive + # Pre-process keywords once + self._keywords = [k if case_sensitive else k.lower() for k in keywords] + + @lru_cache(maxsize=10000) + def _url_bytes(self, url: str) -> bytes: + """Cache decoded URL bytes""" + return url.encode('utf-8') if self._case_sensitive else url.lower().encode('utf-8') + + + def _calculate_score(self, url: str) -> float: + """Fast string matching without regex or byte conversion""" + if not self._case_sensitive: + url = url.lower() + + matches = sum(1 for k in self._keywords if k in url) + + # Fast return paths + if not matches: + return 0.0 + if matches == len(self._keywords): + return 1.0 + + return matches / len(self._keywords) class PathDepthScorer(URLScorer): + # PERF: URL parsing on every call is expensive + # PERF: Split and list comprehension creates temporary lists + # PERF: abs() call adds function overhead + # PERF: Division and addition in score calculation are expensive for high frequency + # PERF: Path parts filtering creates extra list + # PERF: Inherits URLScorer adding method lookup overhead + # PERF: No caching of parsed URLs or calculated depths """Score URLs based on their path depth. path_scorer = PathDepthScorer( @@ -136,8 +345,71 @@ class PathDepthScorer(URLScorer): distance_from_optimal = abs(depth - self.optimal_depth) return 1.0 / (1.0 + distance_from_optimal) +class FastPathDepthScorer(FastURLScorer): + __slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache + + def __init__(self, optimal_depth: int = 3, weight: float = 1.0): + super().__init__(weight=weight) + self._optimal_depth = optimal_depth + + @staticmethod + @lru_cache(maxsize=10000) + def _quick_depth(path: str) -> int: + """Ultra fast path depth calculation. + + Examples: + - "http://example.com" -> 0 # No path segments + - "http://example.com/" -> 0 # Empty path + - "http://example.com/a" -> 1 + - "http://example.com/a/b" -> 2 + """ + if not path or path == '/': + return 0 + + if '/' not in path: + return 0 + + depth = 0 + last_was_slash = True + + for c in path: + if c == '/': + if not last_was_slash: + depth += 1 + last_was_slash = True + else: + last_was_slash = False + + if not last_was_slash: + depth += 1 + + return depth + + @lru_cache(maxsize=10000) # Cache the whole calculation + def _calculate_score(self, url: str) -> float: + pos = url.find('/', url.find('://') + 3) + if pos == -1: + depth = 0 + else: + depth = self._quick_depth(url[pos:]) + + # Use lookup table for common distances + distance = depth - self._optimal_depth + distance = distance if distance >= 0 else -distance # Faster than abs() + + if distance < 4: + return _SCORE_LOOKUP[distance] + + return 1.0 / (1.0 + distance) class ContentTypeScorer(URLScorer): + # PERF: Regex compilation on every initialization + # PERF: Dict lookup and regex search for every URL + # PERF: Pattern iteration adds loop overhead + # PERF: No pattern priority or short-circuit + # PERF: Dict storage has lookup overhead + # PERF: Missing extension fast path check + # PERF: Unnecessary regex for simple extensions """Score URLs based on content type preferences. content_scorer = ContentTypeScorer({ @@ -169,8 +441,100 @@ class ContentTypeScorer(URLScorer): return weight return 0.0 +class FastContentTypeScorer(FastURLScorer): + __slots__ = ('_weight', '_exact_types', '_regex_types') + + def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): + """Initialize scorer with type weights map. + + Args: + type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0}) + weight: Overall weight multiplier for this scorer + """ + super().__init__(weight=weight) + self._exact_types = {} # Fast lookup for simple extensions + self._regex_types = [] # Fallback for complex patterns + + # Split into exact vs regex matchers for performance + for pattern, score in type_weights.items(): + if pattern.startswith('.') and pattern.endswith('$'): + ext = pattern[1:-1] + self._exact_types[ext] = score + else: + self._regex_types.append((re.compile(pattern), score)) + + # Sort complex patterns by score for early exit + self._regex_types.sort(key=lambda x: -x[1]) + + @staticmethod + @lru_cache(maxsize=10000) + def _quick_extension(url: str) -> str: + """Extract file extension ultra-fast without regex/splits. + + Handles: + - Basic extensions: "example.html" -> "html" + - Query strings: "page.php?id=1" -> "php" + - Fragments: "doc.pdf#page=1" -> "pdf" + - Path params: "file.jpg;width=100" -> "jpg" + + Args: + url: URL to extract extension from + + Returns: + Extension without dot, or empty string if none found + """ + pos = url.rfind('.') + if pos == -1: + return '' + + # Find first non-alphanumeric char after extension + end = len(url) + for i in range(pos + 1, len(url)): + c = url[i] + # Stop at query string, fragment, path param or any non-alphanumeric + if c in '?#;' or not c.isalnum(): + end = i + break + + return url[pos + 1:end].lower() + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate content type score for URL. + + Uses staged approach: + 1. Try exact extension match (fast path) + 2. Fall back to regex patterns if needed + + Args: + url: URL to score + + Returns: + Score between 0.0 and 1.0 * weight + """ + # Fast path: direct extension lookup + ext = self._quick_extension(url) + if ext: + score = self._exact_types.get(ext, None) + if score is not None: + return score + + # Slow path: regex patterns + for pattern, score in self._regex_types: + if pattern.search(url): + return score + + return 0.0 class FreshnessScorer(URLScorer): + # PERF: Multiple regex compilations for each pattern + # PERF: Tries all patterns sequentially + # PERF: Regex pattern matching is expensive + # PERF: Int conversion and arithmetic for every match + # PERF: Repeated constant value (2024) hardcoded + # PERF: No URL caching + # PERF: Complex patterns with redundant groups + # PERF: Unnecessary list of patterns when could combine """Score URLs based on freshness indicators. freshness_scorer = FreshnessScorer(weight=0.9) @@ -201,8 +565,96 @@ class FreshnessScorer(URLScorer): return 1.0 - (2024 - year) * 0.1 return 0.5 # Default score for URLs without dates +class FastFreshnessScorer(FastURLScorer): + __slots__ = ('_weight', '_date_pattern', '_current_year') + + def __init__(self, weight: float = 1.0, current_year: int = 2024): + """Initialize freshness scorer. + + Extracts and scores dates from URLs using format: + - YYYY/MM/DD + - YYYY-MM-DD + - YYYY_MM_DD + - YYYY (year only) + + Args: + weight: Score multiplier + current_year: Year to calculate freshness against (default 2024) + """ + super().__init__(weight=weight) + self._current_year = current_year + + # Combined pattern for all date formats + # Uses non-capturing groups (?:) and alternation + self._date_pattern = re.compile( + r'(?:/' # Path separator + r'|[-_])' # or date separators + r'((?:19|20)\d{2})' # Year group (1900-2099) + r'(?:' # Optional month/day group + r'(?:/|[-_])' # Date separator + r'(?:\d{2})' # Month + r'(?:' # Optional day + r'(?:/|[-_])' # Date separator + r'(?:\d{2})' # Day + r')?' # Day is optional + r')?' # Month/day group is optional + ) + + @lru_cache(maxsize=10000) + def _extract_year(self, url: str) -> Optional[int]: + """Extract the most recent year from URL. + + Args: + url: URL to extract year from + + Returns: + Year as int or None if no valid year found + """ + matches = self._date_pattern.finditer(url) + latest_year = None + + # Find most recent year + for match in matches: + year = int(match.group(1)) + if (year <= self._current_year and # Sanity check + (latest_year is None or year > latest_year)): + latest_year = year + + return latest_year + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate freshness score based on URL date. + + More recent years score higher. Uses pre-computed scoring + table for common year differences. + + Args: + url: URL to score + + Returns: + Score between 0.0 and 1.0 * weight + """ + year = self._extract_year(url) + if year is None: + return 0.5 # Default score + + # Use lookup table for common year differences + year_diff = self._current_year - year + if year_diff < len(_FRESHNESS_SCORES): + return _FRESHNESS_SCORES[year_diff] + + # Fallback calculation for older content + return max(0.1, 1.0 - year_diff * 0.1) class DomainAuthorityScorer(URLScorer): + # PERF: URL parsing on every score calculation + # PERF: Repeated domain extraction + # PERF: Case conversion on every lookup + # PERF: Dict lookup without caching + # PERF: Processes full URL when only needs domain + # PERF: No fast path for common domains + # PERF: Netloc includes port which requires extra processing """Score URLs based on domain authority. authority_scorer = DomainAuthorityScorer({ @@ -230,6 +682,112 @@ class DomainAuthorityScorer(URLScorer): domain = urlparse(url).netloc.lower() return self.domain_weights.get(domain, self.default_weight) +class FastDomainAuthorityScorer(FastURLScorer): + __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains') + + def __init__( + self, + domain_weights: Dict[str, float], + default_weight: float = 0.5, + weight: float = 1.0, + ): + """Initialize domain authority scorer. + + Args: + domain_weights: Dict mapping domains to authority scores + default_weight: Score for unknown domains + weight: Overall scorer weight multiplier + + Example: + { + 'python.org': 1.0, + 'github.com': 0.9, + 'medium.com': 0.7 + } + """ + super().__init__(weight=weight) + + # Pre-process domains for faster lookup + self._domain_weights = { + domain.lower(): score + for domain, score in domain_weights.items() + } + self._default_weight = default_weight + + # Cache top domains for fast path + self._top_domains = { + domain: score + for domain, score in sorted( + domain_weights.items(), + key=lambda x: -x[1] + )[:5] # Keep top 5 highest scoring domains + } + + @staticmethod + @lru_cache(maxsize=10000) + def _extract_domain(url: str) -> str: + """Extract domain from URL ultra-fast. + + Handles: + - Basic domains: "example.com" + - Subdomains: "sub.example.com" + - Ports: "example.com:8080" + - IPv4: "192.168.1.1" + + Args: + url: Full URL to extract domain from + + Returns: + Lowercase domain without port + """ + # Find domain start + start = url.find('://') + if start == -1: + start = 0 + else: + start += 3 + + # Find domain end + end = url.find('/', start) + if end == -1: + end = url.find('?', start) + if end == -1: + end = url.find('#', start) + if end == -1: + end = len(url) + + # Extract domain and remove port + domain = url[start:end] + port_idx = domain.rfind(':') + if port_idx != -1: + domain = domain[:port_idx] + + return domain.lower() + + @lru_cache(maxsize=10000) + def _calculate_score(self, url: str) -> float: + """Calculate domain authority score. + + Uses staged approach: + 1. Check top domains (fastest) + 2. Check full domain weights + 3. Return default weight + + Args: + url: URL to score + + Returns: + Authority score between 0.0 and 1.0 * weight + """ + domain = self._extract_domain(url) + + # Fast path: check top domains first + score = self._top_domains.get(domain) + if score is not None: + return score + + # Regular path: check all domains + return self._domain_weights.get(domain, self._default_weight) def create_balanced_scorer() -> CompositeScorer: """Create a balanced composite scorer""" @@ -247,6 +805,21 @@ def create_balanced_scorer() -> CompositeScorer: ] ) +def create_balanced_fast_freshness_scorer() -> CompositeScorer: + """Create a balanced composite scorer with fast freshness scorer""" + return FastCompositeScorer( + [ + FastKeywordRelevanceScorer( + keywords=["article", "blog", "news", "research"], weight=1.0 + ), + FastPathDepthScorer(optimal_depth=3, weight=0.7), + FastContentTypeScorer( + type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6}, + weight=0.8, + ), + FastFreshnessScorer(weight=0.9), + ] + ) # Example Usage: """ @@ -272,3 +845,361 @@ score = scorer.score("https://python.org/article/2024/01/new-features") print(f"Average score: {scorer.stats.average_score}") print(f"URLs scored: {scorer.stats.urls_scored}") """ + + +def run_scorer_performance_test(): + import time + import random + from itertools import cycle + import sys + + # Generate varied test URLs + base_urls = [ + # News/blog articles with dates + "https://example.com/2024/01/article-123", + "https://news.com/2023-12-31/breaking-news", + "https://blog.site.com/2022_11_15/tech-update", + + # Different content types + "https://docs.example.com/report.pdf", + "https://site.com/page.html?q=test", + "https://api.service.com/data.json", + + # Various domain authorities + "https://python.org/downloads", + "https://github.com/repo/code", + "https://medium.com/@user/post", + + # Different path depths + "https://site.com/category/subcategory/product/detail", + "https://shop.com/items", + "https://edu.org/courses/cs/intro/lecture1", + ] + + # Create variations + test_urls = [] + years = list(range(2020, 2025)) + domains = ["example.com", "python.org", "github.com", "medium.com"] + extensions = ["html", "pdf", "php", "jsx"] + + for base in base_urls: + test_urls.append(base) + # Add year variations + for year in years: + test_urls.append(f"https://blog.com/{year}/post-{random.randint(1,999)}") + # Add domain variations + for domain in domains: + test_urls.append(f"https://{domain}/article-{random.randint(1,999)}") + # Add extension variations + for ext in extensions: + test_urls.append(f"https://site.com/doc-{random.randint(1,999)}.{ext}") + + # Multiply dataset + test_urls = test_urls * 5000 # Creates ~300k URLs + + def benchmark(name: str, scorer, urls, warmup=True): + if warmup: + for url in urls[:100]: # Warmup with subset + scorer.score(url) + + start = time.perf_counter_ns() + for url in urls: + scorer.score(url) + elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms + + print( + f"{name:<35} {elapsed:>8.3f} ms ({len(urls)/elapsed*1000:,.0f} URLs/sec)" + ) + return elapsed + + print("\nBenchmarking original vs optimized scorers...") + print("-" * 75) + + # Initialize test data + domain_weights = {"python.org": 1.0, "github.com": 0.9, "medium.com": 0.7} + type_weights = {".html$": 1.0, ".pdf$": 0.8, ".php$": 0.6} + keywords = ["python", "article", "blog", "docs"] + + # Original implementations + keyword_scorer = KeywordRelevanceScorer(keywords=keywords, weight=1.0) + path_scorer = PathDepthScorer(optimal_depth=3, weight=0.7) + content_scorer = ContentTypeScorer(type_weights=type_weights, weight=0.8) + freshness_scorer = FreshnessScorer(weight=0.9) + domain_scorer = DomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) + + # Fast implementations + fast_keyword_scorer = FastKeywordRelevanceScorer(keywords=keywords, weight=1.0) + fast_path_scorer = FastPathDepthScorer(optimal_depth=3, weight=0.7) + fast_content_scorer = FastContentTypeScorer(type_weights=type_weights, weight=0.8) + fast_freshness_scorer = FastFreshnessScorer(weight=0.9) + fast_domain_scorer = FastDomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) + + # Test subset for individual scorers + test_subset = test_urls[:1000] + + print("\nIndividual Scorer Performance (first 1000 URLs):") + + print("\nKeyword Relevance Scorers:") + benchmark("Original Keyword Scorer", keyword_scorer, test_subset) + benchmark("Optimized Keyword Scorer", fast_keyword_scorer, test_subset) + + print("\nPath Depth Scorers:") + benchmark("Original Path Scorer", path_scorer, test_subset) + benchmark("Optimized Path Scorer", fast_path_scorer, test_subset) + + print("\nContent Type Scorers:") + benchmark("Original Content Scorer", content_scorer, test_subset) + benchmark("Optimized Content Scorer", fast_content_scorer, test_subset) + + print("\nFreshness Scorers:") + benchmark("Original Freshness Scorer", freshness_scorer, test_subset) + benchmark("Optimized Freshness Scorer", fast_freshness_scorer, test_subset) + + print("\nDomain Authority Scorers:") + benchmark("Original Domain Scorer", domain_scorer, test_subset) + benchmark("Optimized Domain Scorer", fast_domain_scorer, test_subset) + + # Test composite scorers + print("\nComposite Scorer Performance (all URLs):") + + original_composite = CompositeScorer([ + keyword_scorer, path_scorer, content_scorer, + freshness_scorer, domain_scorer + ]) + + fast_composite = FastCompositeScorer([ + fast_keyword_scorer, fast_path_scorer, fast_content_scorer, + fast_freshness_scorer, fast_domain_scorer + ]) + + benchmark("Original Composite Scorer", original_composite, test_urls) + benchmark("Optimized Composite Scorer", fast_composite, test_urls) + + # Memory usage + print("\nMemory Usage per Scorer:") + print(f"Original Keyword Scorer: {sys.getsizeof(keyword_scorer):,} bytes") + print(f"Optimized Keyword Scorer: {sys.getsizeof(fast_keyword_scorer):,} bytes") + print(f"Original Path Scorer: {sys.getsizeof(path_scorer):,} bytes") + print(f"Optimized Path Scorer: {sys.getsizeof(fast_path_scorer):,} bytes") + print(f"Original Content Scorer: {sys.getsizeof(content_scorer):,} bytes") + print(f"Optimized Content Scorer: {sys.getsizeof(fast_content_scorer):,} bytes") + print(f"Original Freshness Scorer: {sys.getsizeof(freshness_scorer):,} bytes") + print(f"Optimized Freshness Scorer: {sys.getsizeof(fast_freshness_scorer):,} bytes") + print(f"Original Domain Scorer: {sys.getsizeof(domain_scorer):,} bytes") + print(f"Optimized Domain Scorer: {sys.getsizeof(fast_domain_scorer):,} bytes") + print(f"Original Composite: {sys.getsizeof(original_composite):,} bytes") + print(f"Optimized Composite: {sys.getsizeof(fast_composite):,} bytes") + +def test_scorers(): + import time + from itertools import chain + + test_cases = [ + # Keyword Scorer Tests + { + "scorer_type": "keyword", + "config": { + "keywords": ["python", "blog"], + "weight": 1.0, + "case_sensitive": False + }, + "urls": { + "https://example.com/python-blog": 1.0, + "https://example.com/PYTHON-BLOG": 1.0, + "https://example.com/python-only": 0.5, + "https://example.com/other": 0.0 + } + }, + + # Path Depth Scorer Tests + { + "scorer_type": "path_depth", + "config": { + "optimal_depth": 2, + "weight": 1.0 + }, + "urls": { + "https://example.com/a/b": 1.0, + "https://example.com/a": 0.5, + "https://example.com/a/b/c": 0.5, + "https://example.com": 0.33333333 + } + }, + + # Content Type Scorer Tests + { + "scorer_type": "content_type", + "config": { + "type_weights": { + ".html$": 1.0, + ".pdf$": 0.8, + ".jpg$": 0.6 + }, + "weight": 1.0 + }, + "urls": { + "https://example.com/doc.html": 1.0, + "https://example.com/doc.pdf": 0.8, + "https://example.com/img.jpg": 0.6, + "https://example.com/other.txt": 0.0 + } + }, + + # Freshness Scorer Tests + { + "scorer_type": "freshness", + "config": { + "weight": 1.0, # Remove current_year since original doesn't support it + }, + "urls": { + "https://example.com/2024/01/post": 1.0, + "https://example.com/2023/12/post": 0.9, + "https://example.com/2022/post": 0.8, + "https://example.com/no-date": 0.5 + } + }, + + # Domain Authority Scorer Tests + { + "scorer_type": "domain", + "config": { + "domain_weights": { + "python.org": 1.0, + "github.com": 0.8, + "medium.com": 0.6 + }, + "default_weight": 0.3, + "weight": 1.0 + }, + "urls": { + "https://python.org/about": 1.0, + "https://github.com/repo": 0.8, + "https://medium.com/post": 0.6, + "https://unknown.com": 0.3 + } + } + ] + + def create_scorer(scorer_type, config): + if scorer_type == "keyword": + return ( + KeywordRelevanceScorer(**config), + FastKeywordRelevanceScorer(**config) + ) + elif scorer_type == "path_depth": + return ( + PathDepthScorer(**config), + FastPathDepthScorer(**config) + ) + elif scorer_type == "content_type": + return ( + ContentTypeScorer(**config), + FastContentTypeScorer(**config) + ) + elif scorer_type == "freshness": + return ( + FreshnessScorer(**config), + FastFreshnessScorer(**config, current_year=2024) + ) + elif scorer_type == "domain": + return ( + DomainAuthorityScorer(**config), + FastDomainAuthorityScorer(**config) + ) + + def run_accuracy_test(): + print("\nAccuracy Tests:") + print("-" * 50) + + all_passed = True + for test_case in test_cases: + print(f"\nTesting {test_case['scorer_type']} scorer:") + original, fast = create_scorer( + test_case['scorer_type'], + test_case['config'] + ) + + for url, expected in test_case['urls'].items(): + orig_score = round(original.score(url), 8) + fast_score = round(fast.score(url), 8) + expected = round(expected, 8) + + if abs(orig_score - expected) > 0.00001: + print(f"❌ Original Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {orig_score}") + all_passed = False + else: + print(f"✅ Original Passed: URL '{url}'") + + if abs(fast_score - expected) > 0.00001: + print(f"❌ Fast Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {fast_score}") + all_passed = False + else: + print(f"✅ Fast Passed: URL '{url}'") + + return all_passed + + def run_composite_test(): + print("\nTesting Composite Scorer:") + print("-" * 50) + + # Create test data + test_urls = { + "https://python.org/blog/2024/01/new-release.html":0.86666667, + "https://github.com/repo/old-code.pdf": 0.62, + "https://unknown.com/random": 0.26 + } + + # Create composite scorers with all types + original_scorers = [] + fast_scorers = [] + + for test_case in test_cases: + orig, fast = create_scorer( + test_case['scorer_type'], + test_case['config'] + ) + original_scorers.append(orig) + fast_scorers.append(fast) + + original_composite = CompositeScorer(original_scorers, normalize=True) + fast_composite = FastCompositeScorer(fast_scorers, normalize=True) + + all_passed = True + for url, expected in test_urls.items(): + orig_score = round(original_composite.score(url), 8) + fast_score = round(fast_composite.score(url), 8) + + if abs(orig_score - expected) > 0.00001: + print(f"❌ Original Composite Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {orig_score}") + all_passed = False + else: + print(f"✅ Original Composite Passed: URL '{url}'") + + if abs(fast_score - expected) > 0.00001: + print(f"❌ Fast Composite Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {fast_score}") + all_passed = False + else: + print(f"✅ Fast Composite Passed: URL '{url}'") + + return all_passed + + # Run tests + print("Running Scorer Tests...") + accuracy_passed = run_accuracy_test() + composite_passed = run_composite_test() + + if accuracy_passed and composite_passed: + print("\n✨ All tests passed!") + # Note: Already have performance tests in run_scorer_performance_test() + else: + print("\n❌ Some tests failed!") + + + +if __name__ == "__main__": + run_scorer_performance_test() + # test_scorers() \ No newline at end of file diff --git a/crawl4ai/scraper/scorers_review.py b/crawl4ai/scraper/scorers_review.py deleted file mode 100644 index 990254d5..00000000 --- a/crawl4ai/scraper/scorers_review.py +++ /dev/null @@ -1,1208 +0,0 @@ -# from .url_scorer import URLScorer -# from .keyword_relevance_scorer import KeywordRelevanceScorer - -from abc import ABC, abstractmethod -from typing import List, Dict, Optional, Union -from dataclasses import dataclass -from urllib.parse import urlparse, unquote -import re -from collections import defaultdict -import math -import logging -from functools import lru_cache -from array import array -from functools import lru_cache -import ctypes -import platform -PLATFORM = platform.system() - -# Pre-computed scores for common year differences -_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25] - -# Pre-computed scores for common year differences -_FRESHNESS_SCORES = [ - 1.0, # Current year - 0.9, # Last year - 0.8, # 2 years ago - 0.7, # 3 years ago - 0.6, # 4 years ago - 0.5, # 5 years ago -] - -# Pre-computed normalization factors for powers of 2 -_POW2_NORM = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625] - - -@dataclass -class ScoringStats: - # PERF: Dataclass introduces overhead with property access and __init__ - # PERF: Float operations and comparisons are expensive for high-frequency updates - # PERF: Property calculation on every access is inefficient - # PERF: Storing min/max adds memory overhead and comparison costs - # PERF: Using inf/-inf creates unnecessary float objects - urls_scored: int = 0 - total_score: float = 0.0 - min_score: float = float("inf") # Expensive object creation - max_score: float = float("-inf") - - def update(self, score: float): - """Update scoring statistics""" - self.urls_scored += 1 - self.total_score += score - self.min_score = min(self.min_score, score) - self.max_score = max(self.max_score, score) - - @property - def average_score(self) -> float: - """Calculate average score""" - return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0 - -class FastScoringStats: - __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score') - - def __init__(self): - self._urls_scored = 0 - self._total_score = 0.0 - self._min_score = None # Lazy initialization - self._max_score = None - - def update(self, score: float) -> None: - """Optimized update with minimal operations""" - self._urls_scored += 1 - self._total_score += score - - # Lazy min/max tracking - only if actually accessed - if self._min_score is not None: - if score < self._min_score: - self._min_score = score - if self._max_score is not None: - if score > self._max_score: - self._max_score = score - - def get_average(self) -> float: - """Direct calculation instead of property""" - return self._total_score / self._urls_scored if self._urls_scored else 0.0 - - def get_min(self) -> float: - """Lazy min calculation""" - if self._min_score is None: - self._min_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 - return self._min_score - - def get_max(self) -> float: - """Lazy max calculation""" - if self._max_score is None: - self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 - return self._max_score - -class URLScorer(ABC): - # PERF: Property access overhead for weight - # PERF: Unnecessary name attribute - # PERF: Stats object creation overhead - # PERF: Logger creation for each instance - # PERF: Abstract method overhead - - def __init__(self, weight: float = 1.0, name: str = None): - self.weight = weight - self.name = name or self.__class__.__name__ - self.stats = ScoringStats() - self.logger = logging.getLogger(f"urlscorer.{self.name}") - - @abstractmethod - def _calculate_score(self, url: str) -> float: - pass - - def score(self, url: str) -> float: - raw_score = self._calculate_score(url) - weighted_score = raw_score * self.weight - self.stats.update(weighted_score) - return weighted_score - -# Optimized base class -class FastURLScorer(ABC): - __slots__ = ('_weight', '_stats') - - def __init__(self, weight: float = 1.0): - # Store weight directly as float32 for memory efficiency - self._weight = ctypes.c_float(weight).value - self._stats = ScoringStats() - - @abstractmethod - def _calculate_score(self, url: str) -> float: - """Calculate raw score for URL.""" - pass - - def score(self, url: str) -> float: - """Calculate weighted score with minimal overhead.""" - score = self._calculate_score(url) * self._weight - self._stats.update(score) - return score - - @property - def stats(self): - """Access to scoring statistics.""" - return self._stats - - @property - def weight(self): - return self._weight - -class CompositeScorer(URLScorer): - # PERF: Unnecessary list iteration for each score - # PERF: Creates new list for scores - # PERF: Division on every normalization - # PERF: No parallelization for independent scorers - # PERF: No short circuit for zero scores - # PERF: No weighting optimization - # PERF: No caching of combined scores - # PERF: List allocation for scores storag - """Combines multiple scorers with weights""" - - def __init__(self, scorers: List[URLScorer], normalize: bool = True): - super().__init__(name="CompositeScorer") - self.scorers = scorers - self.normalize = normalize - - def _calculate_score(self, url: str) -> float: - scores = [scorer.score(url) for scorer in self.scorers] - total_score = sum(scores) - - if self.normalize and scores: - total_score /= len(scores) - - return total_score - -class FastCompositeScorer(FastURLScorer): - __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array') - - def __init__(self, scorers: List[URLScorer], normalize: bool = True): - """Initialize composite scorer combining multiple scoring strategies. - - Optimized for: - - Fast parallel scoring - - Memory efficient score aggregation - - Quick short-circuit conditions - - Pre-allocated arrays - - Args: - scorers: List of scoring strategies to combine - normalize: Whether to normalize final score by scorer count - """ - super().__init__(weight=1.0) - self._scorers = scorers - self._normalize = normalize - - # Pre-allocate arrays for scores and weights - self._weights_array = array('f', [s.weight for s in scorers]) - self._score_array = array('f', [0.0] * len(scorers)) - - @lru_cache(maxsize=10000) - def _calculate_score(self, url: str) -> float: - """Calculate combined score from all scoring strategies. - - Uses: - 1. Pre-allocated arrays for scores - 2. Short-circuit on zero scores - 3. Optimized normalization - 4. Vectorized operations where possible - - Args: - url: URL to score - - Returns: - Combined and optionally normalized score - """ - total_score = 0.0 - scores = self._score_array - - # Get scores from all scorers - for i, scorer in enumerate(self._scorers): - # Use public score() method which applies weight - scores[i] = scorer.score(url) - total_score += scores[i] - - # Normalize if requested - if self._normalize and self._scorers: - count = len(self._scorers) - return total_score / count - - return total_score - - def score(self, url: str) -> float: - """Public scoring interface with stats tracking. - - Args: - url: URL to score - - Returns: - Final combined score - """ - score = self._calculate_score(url) - self.stats.update(score) - return score - -class KeywordRelevanceScorer(URLScorer): - # PERF: Regex compilation and pattern matching is expensive - # PERF: List comprehension with pattern search has high overhead - # PERF: URL decoding on every calculation - # PERF: Division operation for normalization is costly - # PERF: Case insensitive regex adds overhead - # PERF: No pattern caching or reuse - # PERF: Using inheritance adds method lookup overhead - - """Score URLs based on keyword relevance. - - keyword_scorer = KeywordRelevanceScorer( - keywords=["python", "programming"], - weight=1.0, - case_sensitive=False - ) - - - Score based on keyword matches - - Case sensitivity options - - Weighted scoring - """ - - def __init__( - self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False - ): - super().__init__(weight=weight) - self.keywords = keywords - self.case_sensitive = case_sensitive - self._compile_keywords() - - def _compile_keywords(self): - """Prepare keywords for matching""" - flags = 0 if self.case_sensitive else re.IGNORECASE - self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords] - - def _calculate_score(self, url: str) -> float: - """Calculate score based on keyword matches""" - decoded_url = unquote(url) - total_matches = sum( - 1 for pattern in self.patterns if pattern.search(decoded_url) - ) - # Normalize score between 0 and 1 - return total_matches / len(self.patterns) if self.patterns else 0.0 - -class FastKeywordRelevanceScorer(FastURLScorer): - __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive') - - def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False): - super().__init__(weight=weight) - self._case_sensitive = case_sensitive - # Pre-process keywords once - self._keywords = [k if case_sensitive else k.lower() for k in keywords] - - @lru_cache(maxsize=10000) - def _url_bytes(self, url: str) -> bytes: - """Cache decoded URL bytes""" - return url.encode('utf-8') if self._case_sensitive else url.lower().encode('utf-8') - - - def _calculate_score(self, url: str) -> float: - """Fast string matching without regex or byte conversion""" - if not self._case_sensitive: - url = url.lower() - - matches = sum(1 for k in self._keywords if k in url) - - # Fast return paths - if not matches: - return 0.0 - if matches == len(self._keywords): - return 1.0 - - return matches / len(self._keywords) - -class PathDepthScorer(URLScorer): - # PERF: URL parsing on every call is expensive - # PERF: Split and list comprehension creates temporary lists - # PERF: abs() call adds function overhead - # PERF: Division and addition in score calculation are expensive for high frequency - # PERF: Path parts filtering creates extra list - # PERF: Inherits URLScorer adding method lookup overhead - # PERF: No caching of parsed URLs or calculated depths - """Score URLs based on their path depth. - - path_scorer = PathDepthScorer( - optimal_depth=3, # Preferred URL depth - weight=0.7 - ) - - - Score based on URL path depth - - Configurable optimal depth - - Diminishing returns for deeper paths - """ - - def __init__(self, optimal_depth: int = 3, weight: float = 1.0): - super().__init__(weight=weight) - self.optimal_depth = optimal_depth - - def _calculate_score(self, url: str) -> float: - """Calculate score based on path depth""" - path = urlparse(url).path - depth = len([x for x in path.split("/") if x]) - - # Score decreases as we move away from optimal depth - distance_from_optimal = abs(depth - self.optimal_depth) - return 1.0 / (1.0 + distance_from_optimal) - -class FastPathDepthScorer(FastURLScorer): - __slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache - - def __init__(self, optimal_depth: int = 3, weight: float = 1.0): - super().__init__(weight=weight) - self._optimal_depth = optimal_depth - - @staticmethod - @lru_cache(maxsize=10000) - def _quick_depth(path: str) -> int: - """Ultra fast path depth calculation. - - Examples: - - "http://example.com" -> 0 # No path segments - - "http://example.com/" -> 0 # Empty path - - "http://example.com/a" -> 1 - - "http://example.com/a/b" -> 2 - """ - if not path or path == '/': - return 0 - - if '/' not in path: - return 0 - - depth = 0 - last_was_slash = True - - for c in path: - if c == '/': - if not last_was_slash: - depth += 1 - last_was_slash = True - else: - last_was_slash = False - - if not last_was_slash: - depth += 1 - - return depth - - @lru_cache(maxsize=10000) # Cache the whole calculation - def _calculate_score(self, url: str) -> float: - pos = url.find('/', url.find('://') + 3) - if pos == -1: - depth = 0 - else: - depth = self._quick_depth(url[pos:]) - - # Use lookup table for common distances - distance = depth - self._optimal_depth - distance = distance if distance >= 0 else -distance # Faster than abs() - - if distance < 4: - return _SCORE_LOOKUP[distance] - - return 1.0 / (1.0 + distance) - -class ContentTypeScorer(URLScorer): - # PERF: Regex compilation on every initialization - # PERF: Dict lookup and regex search for every URL - # PERF: Pattern iteration adds loop overhead - # PERF: No pattern priority or short-circuit - # PERF: Dict storage has lookup overhead - # PERF: Missing extension fast path check - # PERF: Unnecessary regex for simple extensions - """Score URLs based on content type preferences. - - content_scorer = ContentTypeScorer({ - r'\.html$': 1.0, - r'\.pdf$': 0.8, - r'\.xml$': 0.6 - }) - - - Score based on file types - - Configurable type weights - - Pattern matching support - """ - - def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): - super().__init__(weight=weight) - self.type_weights = type_weights - self._compile_patterns() - - def _compile_patterns(self): - """Prepare content type patterns""" - self.patterns = { - re.compile(pattern): weight for pattern, weight in self.type_weights.items() - } - - def _calculate_score(self, url: str) -> float: - """Calculate score based on content type matching""" - for pattern, weight in self.patterns.items(): - if pattern.search(url): - return weight - return 0.0 - -class FastContentTypeScorer(FastURLScorer): - __slots__ = ('_weight', '_exact_types', '_regex_types') - - def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): - """Initialize scorer with type weights map. - - Args: - type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0}) - weight: Overall weight multiplier for this scorer - """ - super().__init__(weight=weight) - self._exact_types = {} # Fast lookup for simple extensions - self._regex_types = [] # Fallback for complex patterns - - # Split into exact vs regex matchers for performance - for pattern, score in type_weights.items(): - if pattern.startswith('.') and pattern.endswith('$'): - ext = pattern[1:-1] - self._exact_types[ext] = score - else: - self._regex_types.append((re.compile(pattern), score)) - - # Sort complex patterns by score for early exit - self._regex_types.sort(key=lambda x: -x[1]) - - @staticmethod - @lru_cache(maxsize=10000) - def _quick_extension(url: str) -> str: - """Extract file extension ultra-fast without regex/splits. - - Handles: - - Basic extensions: "example.html" -> "html" - - Query strings: "page.php?id=1" -> "php" - - Fragments: "doc.pdf#page=1" -> "pdf" - - Path params: "file.jpg;width=100" -> "jpg" - - Args: - url: URL to extract extension from - - Returns: - Extension without dot, or empty string if none found - """ - pos = url.rfind('.') - if pos == -1: - return '' - - # Find first non-alphanumeric char after extension - end = len(url) - for i in range(pos + 1, len(url)): - c = url[i] - # Stop at query string, fragment, path param or any non-alphanumeric - if c in '?#;' or not c.isalnum(): - end = i - break - - return url[pos + 1:end].lower() - - @lru_cache(maxsize=10000) - def _calculate_score(self, url: str) -> float: - """Calculate content type score for URL. - - Uses staged approach: - 1. Try exact extension match (fast path) - 2. Fall back to regex patterns if needed - - Args: - url: URL to score - - Returns: - Score between 0.0 and 1.0 * weight - """ - # Fast path: direct extension lookup - ext = self._quick_extension(url) - if ext: - score = self._exact_types.get(ext, None) - if score is not None: - return score - - # Slow path: regex patterns - for pattern, score in self._regex_types: - if pattern.search(url): - return score - - return 0.0 - -class FreshnessScorer(URLScorer): - # PERF: Multiple regex compilations for each pattern - # PERF: Tries all patterns sequentially - # PERF: Regex pattern matching is expensive - # PERF: Int conversion and arithmetic for every match - # PERF: Repeated constant value (2024) hardcoded - # PERF: No URL caching - # PERF: Complex patterns with redundant groups - # PERF: Unnecessary list of patterns when could combine - """Score URLs based on freshness indicators. - - freshness_scorer = FreshnessScorer(weight=0.9) - - Score based on date indicators in URLs - Multiple date format support - Recency weighting""" - - def __init__(self, weight: float = 1.0): - super().__init__(weight=weight) - self.date_patterns = [ - r"/(\d{4})/(\d{2})/(\d{2})/", # yyyy/mm/dd - r"(\d{4})[-_](\d{2})[-_](\d{2})", # yyyy-mm-dd - r"/(\d{4})/", # year only - ] - self._compile_patterns() - - def _compile_patterns(self): - """Prepare date patterns""" - self.compiled_patterns = [re.compile(p) for p in self.date_patterns] - - def _calculate_score(self, url: str) -> float: - """Calculate score based on date indicators""" - for pattern in self.compiled_patterns: - if match := pattern.search(url): - year = int(match.group(1)) - # Score higher for more recent years - return 1.0 - (2024 - year) * 0.1 - return 0.5 # Default score for URLs without dates - -class FastFreshnessScorer(FastURLScorer): - __slots__ = ('_weight', '_date_pattern', '_current_year') - - def __init__(self, weight: float = 1.0, current_year: int = 2024): - """Initialize freshness scorer. - - Extracts and scores dates from URLs using format: - - YYYY/MM/DD - - YYYY-MM-DD - - YYYY_MM_DD - - YYYY (year only) - - Args: - weight: Score multiplier - current_year: Year to calculate freshness against (default 2024) - """ - super().__init__(weight=weight) - self._current_year = current_year - - # Combined pattern for all date formats - # Uses non-capturing groups (?:) and alternation - self._date_pattern = re.compile( - r'(?:/' # Path separator - r'|[-_])' # or date separators - r'((?:19|20)\d{2})' # Year group (1900-2099) - r'(?:' # Optional month/day group - r'(?:/|[-_])' # Date separator - r'(?:\d{2})' # Month - r'(?:' # Optional day - r'(?:/|[-_])' # Date separator - r'(?:\d{2})' # Day - r')?' # Day is optional - r')?' # Month/day group is optional - ) - - @lru_cache(maxsize=10000) - def _extract_year(self, url: str) -> Optional[int]: - """Extract the most recent year from URL. - - Args: - url: URL to extract year from - - Returns: - Year as int or None if no valid year found - """ - matches = self._date_pattern.finditer(url) - latest_year = None - - # Find most recent year - for match in matches: - year = int(match.group(1)) - if (year <= self._current_year and # Sanity check - (latest_year is None or year > latest_year)): - latest_year = year - - return latest_year - - @lru_cache(maxsize=10000) - def _calculate_score(self, url: str) -> float: - """Calculate freshness score based on URL date. - - More recent years score higher. Uses pre-computed scoring - table for common year differences. - - Args: - url: URL to score - - Returns: - Score between 0.0 and 1.0 * weight - """ - year = self._extract_year(url) - if year is None: - return 0.5 # Default score - - # Use lookup table for common year differences - year_diff = self._current_year - year - if year_diff < len(_FRESHNESS_SCORES): - return _FRESHNESS_SCORES[year_diff] - - # Fallback calculation for older content - return max(0.1, 1.0 - year_diff * 0.1) - -class DomainAuthorityScorer(URLScorer): - # PERF: URL parsing on every score calculation - # PERF: Repeated domain extraction - # PERF: Case conversion on every lookup - # PERF: Dict lookup without caching - # PERF: Processes full URL when only needs domain - # PERF: No fast path for common domains - # PERF: Netloc includes port which requires extra processing - """Score URLs based on domain authority. - - authority_scorer = DomainAuthorityScorer({ - "python.org": 1.0, - "github.com": 0.9, - "medium.com": 0.7 - }) - - Score based on domain importance - Configurable domain weights - Default weight for unknown domains""" - - def __init__( - self, - domain_weights: Dict[str, float], - default_weight: float = 0.5, - weight: float = 1.0, - ): - super().__init__(weight=weight) - self.domain_weights = domain_weights - self.default_weight = default_weight - - def _calculate_score(self, url: str) -> float: - """Calculate score based on domain authority""" - domain = urlparse(url).netloc.lower() - return self.domain_weights.get(domain, self.default_weight) - -class FastDomainAuthorityScorer(FastURLScorer): - __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains') - - def __init__( - self, - domain_weights: Dict[str, float], - default_weight: float = 0.5, - weight: float = 1.0, - ): - """Initialize domain authority scorer. - - Args: - domain_weights: Dict mapping domains to authority scores - default_weight: Score for unknown domains - weight: Overall scorer weight multiplier - - Example: - { - 'python.org': 1.0, - 'github.com': 0.9, - 'medium.com': 0.7 - } - """ - super().__init__(weight=weight) - - # Pre-process domains for faster lookup - self._domain_weights = { - domain.lower(): score - for domain, score in domain_weights.items() - } - self._default_weight = default_weight - - # Cache top domains for fast path - self._top_domains = { - domain: score - for domain, score in sorted( - domain_weights.items(), - key=lambda x: -x[1] - )[:5] # Keep top 5 highest scoring domains - } - - @staticmethod - @lru_cache(maxsize=10000) - def _extract_domain(url: str) -> str: - """Extract domain from URL ultra-fast. - - Handles: - - Basic domains: "example.com" - - Subdomains: "sub.example.com" - - Ports: "example.com:8080" - - IPv4: "192.168.1.1" - - Args: - url: Full URL to extract domain from - - Returns: - Lowercase domain without port - """ - # Find domain start - start = url.find('://') - if start == -1: - start = 0 - else: - start += 3 - - # Find domain end - end = url.find('/', start) - if end == -1: - end = url.find('?', start) - if end == -1: - end = url.find('#', start) - if end == -1: - end = len(url) - - # Extract domain and remove port - domain = url[start:end] - port_idx = domain.rfind(':') - if port_idx != -1: - domain = domain[:port_idx] - - return domain.lower() - - @lru_cache(maxsize=10000) - def _calculate_score(self, url: str) -> float: - """Calculate domain authority score. - - Uses staged approach: - 1. Check top domains (fastest) - 2. Check full domain weights - 3. Return default weight - - Args: - url: URL to score - - Returns: - Authority score between 0.0 and 1.0 * weight - """ - domain = self._extract_domain(url) - - # Fast path: check top domains first - score = self._top_domains.get(domain) - if score is not None: - return score - - # Regular path: check all domains - return self._domain_weights.get(domain, self._default_weight) - -def create_balanced_scorer() -> CompositeScorer: - """Create a balanced composite scorer""" - return CompositeScorer( - [ - KeywordRelevanceScorer( - keywords=["article", "blog", "news", "research"], weight=1.0 - ), - PathDepthScorer(optimal_depth=3, weight=0.7), - ContentTypeScorer( - type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6}, - weight=0.8, - ), - FreshnessScorer(weight=0.9), - ] - ) - -def create_balanced_fast_freshness_scorer() -> CompositeScorer: - """Create a balanced composite scorer with fast freshness scorer""" - return FastCompositeScorer( - [ - FastKeywordRelevanceScorer( - keywords=["article", "blog", "news", "research"], weight=1.0 - ), - FastPathDepthScorer(optimal_depth=3, weight=0.7), - FastContentTypeScorer( - type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6}, - weight=0.8, - ), - FastFreshnessScorer(weight=0.9), - ] - ) - -# Example Usage: -""" -# Create a composite scorer -scorer = CompositeScorer([ - KeywordRelevanceScorer(["python", "programming"], weight=1.0), - PathDepthScorer(optimal_depth=2, weight=0.7), - FreshnessScorer(weight=0.8), - DomainAuthorityScorer( - domain_weights={ - "python.org": 1.0, - "github.com": 0.9, - "medium.com": 0.7 - }, - weight=0.9 - ) -]) - -# Score a URL -score = scorer.score("https://python.org/article/2024/01/new-features") - -# Access statistics -print(f"Average score: {scorer.stats.average_score}") -print(f"URLs scored: {scorer.stats.urls_scored}") -""" - - -def run_scorer_performance_test(): - import time - import random - from itertools import cycle - import sys - - # Generate varied test URLs - base_urls = [ - # News/blog articles with dates - "https://example.com/2024/01/article-123", - "https://news.com/2023-12-31/breaking-news", - "https://blog.site.com/2022_11_15/tech-update", - - # Different content types - "https://docs.example.com/report.pdf", - "https://site.com/page.html?q=test", - "https://api.service.com/data.json", - - # Various domain authorities - "https://python.org/downloads", - "https://github.com/repo/code", - "https://medium.com/@user/post", - - # Different path depths - "https://site.com/category/subcategory/product/detail", - "https://shop.com/items", - "https://edu.org/courses/cs/intro/lecture1", - ] - - # Create variations - test_urls = [] - years = list(range(2020, 2025)) - domains = ["example.com", "python.org", "github.com", "medium.com"] - extensions = ["html", "pdf", "php", "jsx"] - - for base in base_urls: - test_urls.append(base) - # Add year variations - for year in years: - test_urls.append(f"https://blog.com/{year}/post-{random.randint(1,999)}") - # Add domain variations - for domain in domains: - test_urls.append(f"https://{domain}/article-{random.randint(1,999)}") - # Add extension variations - for ext in extensions: - test_urls.append(f"https://site.com/doc-{random.randint(1,999)}.{ext}") - - # Multiply dataset - test_urls = test_urls * 5000 # Creates ~300k URLs - - def benchmark(name: str, scorer, urls, warmup=True): - if warmup: - for url in urls[:100]: # Warmup with subset - scorer.score(url) - - start = time.perf_counter_ns() - for url in urls: - scorer.score(url) - elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms - - print( - f"{name:<35} {elapsed:>8.3f} ms ({len(urls)/elapsed*1000:,.0f} URLs/sec)" - ) - return elapsed - - print("\nBenchmarking original vs optimized scorers...") - print("-" * 75) - - # Initialize test data - domain_weights = {"python.org": 1.0, "github.com": 0.9, "medium.com": 0.7} - type_weights = {".html$": 1.0, ".pdf$": 0.8, ".php$": 0.6} - keywords = ["python", "article", "blog", "docs"] - - # Original implementations - keyword_scorer = KeywordRelevanceScorer(keywords=keywords, weight=1.0) - path_scorer = PathDepthScorer(optimal_depth=3, weight=0.7) - content_scorer = ContentTypeScorer(type_weights=type_weights, weight=0.8) - freshness_scorer = FreshnessScorer(weight=0.9) - domain_scorer = DomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) - - # Fast implementations - fast_keyword_scorer = FastKeywordRelevanceScorer(keywords=keywords, weight=1.0) - fast_path_scorer = FastPathDepthScorer(optimal_depth=3, weight=0.7) - fast_content_scorer = FastContentTypeScorer(type_weights=type_weights, weight=0.8) - fast_freshness_scorer = FastFreshnessScorer(weight=0.9) - fast_domain_scorer = FastDomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) - - # Test subset for individual scorers - test_subset = test_urls[:1000] - - print("\nIndividual Scorer Performance (first 1000 URLs):") - - print("\nKeyword Relevance Scorers:") - benchmark("Original Keyword Scorer", keyword_scorer, test_subset) - benchmark("Optimized Keyword Scorer", fast_keyword_scorer, test_subset) - - print("\nPath Depth Scorers:") - benchmark("Original Path Scorer", path_scorer, test_subset) - benchmark("Optimized Path Scorer", fast_path_scorer, test_subset) - - print("\nContent Type Scorers:") - benchmark("Original Content Scorer", content_scorer, test_subset) - benchmark("Optimized Content Scorer", fast_content_scorer, test_subset) - - print("\nFreshness Scorers:") - benchmark("Original Freshness Scorer", freshness_scorer, test_subset) - benchmark("Optimized Freshness Scorer", fast_freshness_scorer, test_subset) - - print("\nDomain Authority Scorers:") - benchmark("Original Domain Scorer", domain_scorer, test_subset) - benchmark("Optimized Domain Scorer", fast_domain_scorer, test_subset) - - # Test composite scorers - print("\nComposite Scorer Performance (all URLs):") - - original_composite = CompositeScorer([ - keyword_scorer, path_scorer, content_scorer, - freshness_scorer, domain_scorer - ]) - - fast_composite = FastCompositeScorer([ - fast_keyword_scorer, fast_path_scorer, fast_content_scorer, - fast_freshness_scorer, fast_domain_scorer - ]) - - benchmark("Original Composite Scorer", original_composite, test_urls) - benchmark("Optimized Composite Scorer", fast_composite, test_urls) - - # Memory usage - print("\nMemory Usage per Scorer:") - print(f"Original Keyword Scorer: {sys.getsizeof(keyword_scorer):,} bytes") - print(f"Optimized Keyword Scorer: {sys.getsizeof(fast_keyword_scorer):,} bytes") - print(f"Original Path Scorer: {sys.getsizeof(path_scorer):,} bytes") - print(f"Optimized Path Scorer: {sys.getsizeof(fast_path_scorer):,} bytes") - print(f"Original Content Scorer: {sys.getsizeof(content_scorer):,} bytes") - print(f"Optimized Content Scorer: {sys.getsizeof(fast_content_scorer):,} bytes") - print(f"Original Freshness Scorer: {sys.getsizeof(freshness_scorer):,} bytes") - print(f"Optimized Freshness Scorer: {sys.getsizeof(fast_freshness_scorer):,} bytes") - print(f"Original Domain Scorer: {sys.getsizeof(domain_scorer):,} bytes") - print(f"Optimized Domain Scorer: {sys.getsizeof(fast_domain_scorer):,} bytes") - print(f"Original Composite: {sys.getsizeof(original_composite):,} bytes") - print(f"Optimized Composite: {sys.getsizeof(fast_composite):,} bytes") - -def test_scorers(): - import time - from itertools import chain - - test_cases = [ - # Keyword Scorer Tests - { - "scorer_type": "keyword", - "config": { - "keywords": ["python", "blog"], - "weight": 1.0, - "case_sensitive": False - }, - "urls": { - "https://example.com/python-blog": 1.0, - "https://example.com/PYTHON-BLOG": 1.0, - "https://example.com/python-only": 0.5, - "https://example.com/other": 0.0 - } - }, - - # Path Depth Scorer Tests - { - "scorer_type": "path_depth", - "config": { - "optimal_depth": 2, - "weight": 1.0 - }, - "urls": { - "https://example.com/a/b": 1.0, - "https://example.com/a": 0.5, - "https://example.com/a/b/c": 0.5, - "https://example.com": 0.33333333 - } - }, - - # Content Type Scorer Tests - { - "scorer_type": "content_type", - "config": { - "type_weights": { - ".html$": 1.0, - ".pdf$": 0.8, - ".jpg$": 0.6 - }, - "weight": 1.0 - }, - "urls": { - "https://example.com/doc.html": 1.0, - "https://example.com/doc.pdf": 0.8, - "https://example.com/img.jpg": 0.6, - "https://example.com/other.txt": 0.0 - } - }, - - # Freshness Scorer Tests - { - "scorer_type": "freshness", - "config": { - "weight": 1.0, # Remove current_year since original doesn't support it - }, - "urls": { - "https://example.com/2024/01/post": 1.0, - "https://example.com/2023/12/post": 0.9, - "https://example.com/2022/post": 0.8, - "https://example.com/no-date": 0.5 - } - }, - - # Domain Authority Scorer Tests - { - "scorer_type": "domain", - "config": { - "domain_weights": { - "python.org": 1.0, - "github.com": 0.8, - "medium.com": 0.6 - }, - "default_weight": 0.3, - "weight": 1.0 - }, - "urls": { - "https://python.org/about": 1.0, - "https://github.com/repo": 0.8, - "https://medium.com/post": 0.6, - "https://unknown.com": 0.3 - } - } - ] - - def create_scorer(scorer_type, config): - if scorer_type == "keyword": - return ( - KeywordRelevanceScorer(**config), - FastKeywordRelevanceScorer(**config) - ) - elif scorer_type == "path_depth": - return ( - PathDepthScorer(**config), - FastPathDepthScorer(**config) - ) - elif scorer_type == "content_type": - return ( - ContentTypeScorer(**config), - FastContentTypeScorer(**config) - ) - elif scorer_type == "freshness": - return ( - FreshnessScorer(**config), - FastFreshnessScorer(**config, current_year=2024) - ) - elif scorer_type == "domain": - return ( - DomainAuthorityScorer(**config), - FastDomainAuthorityScorer(**config) - ) - - def run_accuracy_test(): - print("\nAccuracy Tests:") - print("-" * 50) - - all_passed = True - for test_case in test_cases: - print(f"\nTesting {test_case['scorer_type']} scorer:") - original, fast = create_scorer( - test_case['scorer_type'], - test_case['config'] - ) - - for url, expected in test_case['urls'].items(): - orig_score = round(original.score(url), 8) - fast_score = round(fast.score(url), 8) - expected = round(expected, 8) - - if abs(orig_score - expected) > 0.00001: - print(f"❌ Original Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {orig_score}") - all_passed = False - else: - print(f"✅ Original Passed: URL '{url}'") - - if abs(fast_score - expected) > 0.00001: - print(f"❌ Fast Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {fast_score}") - all_passed = False - else: - print(f"✅ Fast Passed: URL '{url}'") - - return all_passed - - def run_composite_test(): - print("\nTesting Composite Scorer:") - print("-" * 50) - - # Create test data - test_urls = { - "https://python.org/blog/2024/01/new-release.html":0.86666667, - "https://github.com/repo/old-code.pdf": 0.62, - "https://unknown.com/random": 0.26 - } - - # Create composite scorers with all types - original_scorers = [] - fast_scorers = [] - - for test_case in test_cases: - orig, fast = create_scorer( - test_case['scorer_type'], - test_case['config'] - ) - original_scorers.append(orig) - fast_scorers.append(fast) - - original_composite = CompositeScorer(original_scorers, normalize=True) - fast_composite = FastCompositeScorer(fast_scorers, normalize=True) - - all_passed = True - for url, expected in test_urls.items(): - orig_score = round(original_composite.score(url), 8) - fast_score = round(fast_composite.score(url), 8) - - if abs(orig_score - expected) > 0.00001: - print(f"❌ Original Composite Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {orig_score}") - all_passed = False - else: - print(f"✅ Original Composite Passed: URL '{url}'") - - if abs(fast_score - expected) > 0.00001: - print(f"❌ Fast Composite Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {fast_score}") - all_passed = False - else: - print(f"✅ Fast Composite Passed: URL '{url}'") - - return all_passed - - # Run tests - print("Running Scorer Tests...") - accuracy_passed = run_accuracy_test() - composite_passed = run_composite_test() - - if accuracy_passed and composite_passed: - print("\n✨ All tests passed!") - # Note: Already have performance tests in run_scorer_performance_test() - else: - print("\n❌ Some tests failed!") - - - -if __name__ == "__main__": - run_scorer_performance_test() - # test_scorers() \ No newline at end of file