diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 13410c4f..746df82b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -17,11 +17,16 @@ from .extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy, - JsonXPathExtractionStrategy + JsonXPathExtractionStrategy, ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter +from .content_filter_strategy import ( + PruningContentFilter, + BM25ContentFilter, + LLMContentFilter, + RelevantContentFilter, +) from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -29,20 +34,25 @@ from .async_dispatcher import ( RateLimiter, CrawlerMonitor, DisplayMode, - BaseDispatcher + BaseDispatcher, ) from .docker_client import Crawl4aiDockerClient from .hub import CrawlerHub from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, - FastFilterChain, - FastContentTypeFilter, - FastDomainFilter, - FastURLFilter, - FastFilterStats, - FastKeywordRelevanceScorer, - FastURLScorer, + FilterChain, + ContentTypeFilter, + DomainFilter, + URLFilter, + FilterStats, + SEOFilter, + KeywordRelevanceScorer, + URLScorer, + CompositeScorer, + DomainAuthorityScorer, + FreshnessScorer, + PathDepthScorer, BestFirstCrawlingStrategy, DFSDeepCrawlStrategy, DeepCrawlDecorator, @@ -54,13 +64,18 @@ __all__ = [ "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", - "FastFilterChain", - "FastContentTypeFilter", - "FastDomainFilter", - "FastFilterStats", - "FastURLFilter", - "FastKeywordRelevanceScorer", - "FastURLScorer", + "FilterChain", + "ContentTypeFilter", + "DomainFilter", + "FilterStats", + "URLFilter", + "SEOFilter", + "KeywordRelevanceScorer", + "URLScorer", + "CompositeScorer", + "DomainAuthorityScorer", + "FreshnessScorer", + "PathDepthScorer", "DeepCrawlDecorator", "CrawlResult", "CrawlerHub", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index d93e27d1..19b6a689 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -886,7 +886,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ try: - viewport_height = page.viewport_size.get( + viewport_size = page.viewport_size + if viewport_size is None: + await page.set_viewport_size( + {"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height} + ) + viewport_size = page.viewport_size + + viewport_height = viewport_size.get( "height", self.browser_config.viewport_height ) current_position = viewport_height @@ -946,7 +953,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ try: suggested_filename = download.suggested_filename - download_path = os.path.join(self.downloads_path, suggested_filename) + download_path = os.path.join(self.browser_config.downloads_path, suggested_filename) self.logger.info( message="Downloading {filename} to {path}", diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1c76dd79..c68036e8 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -166,7 +166,7 @@ class AsyncWebCrawler: ) # Initialize crawler strategy - params = {k: v for k, v in kwargs.items() if k in ["browser_congig", "logger"]} + params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]} self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( browser_config=browser_config, logger=self.logger, diff --git a/crawl4ai/deep_crawling/__init__.py b/crawl4ai/deep_crawling/__init__.py index f885d2ab..a64d431b 100644 --- a/crawl4ai/deep_crawling/__init__.py +++ b/crawl4ai/deep_crawling/__init__.py @@ -4,15 +4,22 @@ from .bfs_strategy import BFSDeepCrawlStrategy from .bff_strategy import BestFirstCrawlingStrategy from .dfs_strategy import DFSDeepCrawlStrategy from .filters import ( - FastFilterChain, - FastContentTypeFilter, - FastDomainFilter, - FastURLFilter, - FastFilterStats, + FilterChain, + ContentTypeFilter, + DomainFilter, + URLFilter, + FilterStats, + ContentRelevanceFilter, + SEOFilter ) from .scorers import ( - FastKeywordRelevanceScorer, - FastURLScorer, + KeywordRelevanceScorer, + URLScorer, + CompositeScorer, + DomainAuthorityScorer, + FreshnessScorer, + PathDepthScorer, + ContentTypeScorer ) __all__ = [ @@ -21,11 +28,18 @@ __all__ = [ "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", - "FastFilterChain", - "FastContentTypeFilter", - "FastDomainFilter", - "FastURLFilter", - "FastFilterStats", - "FastKeywordRelevanceScorer", - "FastURLScorer", -] \ No newline at end of file + "FilterChain", + "ContentTypeFilter", + "DomainFilter", + "URLFilter", + "FilterStats", + "ContentRelevanceFilter", + "SEOFilter", + "KeywordRelevanceScorer", + "URLScorer", + "CompositeScorer", + "DomainAuthorityScorer", + "FreshnessScorer", + "PathDepthScorer", + "ContentTypeScorer", +] diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index aef38881..f1e871ee 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple from urllib.parse import urlparse from ..models import TraversalStats -from .filters import FastFilterChain -from .scorers import FastURLScorer +from .filters import FilterChain +from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn @@ -34,8 +34,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): def __init__( self, max_depth: int, - filter_chain: FastFilterChain = FastFilterChain(), - url_scorer: Optional[FastURLScorer] = None, + filter_chain: FilterChain = FilterChain(), + url_scorer: Optional[URLScorer] = None, include_external: bool = False, logger: Optional[logging.Logger] = None, ): @@ -64,7 +64,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.logger.warning(f"Invalid URL: {url}, error: {e}") return False - if depth != 0 and not self.filter_chain.apply(url): + if depth != 0 and not await self.filter_chain.apply(url): return False return True diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index c8b700c4..48c0c240 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple from urllib.parse import urlparse from ..models import TraversalStats -from .filters import FastFilterChain -from .scorers import FastURLScorer +from .filters import FilterChain +from .scorers import URLScorer from . import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult @@ -23,8 +23,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): def __init__( self, max_depth: int, - filter_chain: FastFilterChain = FastFilterChain(), - url_scorer: Optional[FastURLScorer] = None, + filter_chain: FilterChain = FilterChain(), + url_scorer: Optional[URLScorer] = None, include_external: bool = False, logger: Optional[logging.Logger] = None, ): @@ -53,7 +53,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.logger.warning(f"Invalid URL: {url}, error: {e}") return False - if depth != 0 and not self.filter_chain.apply(url): + if depth != 0 and not await self.filter_chain.apply(url): return False return True diff --git a/crawl4ai/deep_crawling/crazy.py b/crawl4ai/deep_crawling/crazy.py index 4ba94d65..8804e7bd 100644 --- a/crawl4ai/deep_crawling/crazy.py +++ b/crawl4ai/deep_crawling/crazy.py @@ -374,7 +374,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): parsed = urlparse(url) return (parsed.scheme in {'http', 'https'} and '.' in parsed.netloc - and self.filter_chain.apply(url)) + and await self.filter_chain.apply(url)) except Exception: return False diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index 4e754424..c8af3022 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -8,224 +8,16 @@ from functools import lru_cache import fnmatch from dataclasses import dataclass import weakref -import mimetypes import math from collections import defaultdict from typing import Dict from ..utils import HeadPeekr +import asyncio +import inspect + @dataclass class FilterStats: - # PERF: Using dataclass creates overhead with __init__ and property access - # PERF: Could use __slots__ to reduce memory footprint - # PERF: Consider using array.array('I') for atomic increments - total_urls: int = 0 - rejected_urls: int = 0 - passed_urls: int = 0 - - -class URLFilter(ABC): - # PERF: Logger creation is expensive, consider lazy initialization - # PERF: stats object creation adds overhead for each filter instance - def __init__(self, name: str = None): - self.name = name or self.__class__.__name__ - self.stats = FilterStats() - self.logger = logging.getLogger(f"urlfilter.{self.name}") - - @abstractmethod - def apply(self, url: str) -> bool: - pass - - def _update_stats(self, passed: bool): - # PERF: Already optimized but could use bitwise operations - # PERF: Consider removing stats entirely in production/fast mode - self.stats.total_urls += 1 - self.stats.passed_urls += passed - self.stats.rejected_urls += not passed - - -class FilterChain: - # PERF: List traversal for each URL is expensive - # PERF: Could use array.array instead of list for filters - # PERF: Consider adding fast path for single filter case - def __init__(self, filters: List[URLFilter] = None): - self.filters = filters or [] - self.stats = FilterStats() - self.logger = logging.getLogger("urlfilter.chain") - - def apply(self, url: str) -> bool: - # PERF: Logging on every rejection is expensive - # PERF: Could reorder filters by rejection rate - # PERF: Consider batch processing mode - self.stats.total_urls += 1 - - for filter_ in self.filters: - if not filter_.apply(url): - self.stats.rejected_urls += 1 - self.logger.debug(f"URL {url} rejected by {filter_.name}") - return False - - self.stats.passed_urls += 1 - return True - - -class URLPatternFilter(URLFilter): - # PERF: Converting glob to regex is expensive - # PERF: Multiple regex compilation is slow - # PERF: List of patterns causes multiple regex evaluations - def __init__( - self, - patterns: Union[str, Pattern, List[Union[str, Pattern]]], - use_glob: bool = True, - ): - super().__init__() - self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - self.use_glob = use_glob - self._compiled_patterns = [] - - # PERF: This could be consolidated into a single regex with OR conditions - # PERF: glob_to_regex creates complex patterns, could be simplified - for pattern in self.patterns: - if isinstance(pattern, str) and use_glob: - self._compiled_patterns.append(self._glob_to_regex(pattern)) - else: - self._compiled_patterns.append( - re.compile(pattern) if isinstance(pattern, str) else pattern - ) - - def _glob_to_regex(self, pattern: str) -> Pattern: - # PERF: fnmatch.translate creates overly complex patterns - # PERF: Could cache common translations - return re.compile(fnmatch.translate(pattern)) - - def apply(self, url: str) -> bool: - # PERF: any() with generator is slower than direct loop with early return - # PERF: searching entire string is slower than anchored match - matches = any(pattern.search(url) for pattern in self._compiled_patterns) - self._update_stats(matches) - return matches - - -class ContentTypeFilter(URLFilter): - # PERF: mimetypes guessing is extremely slow - # PERF: URL parsing on every check is expensive - # PERF: No caching of results for similar extensions - def __init__( - self, allowed_types: Union[str, List[str]], check_extension: bool = True - ): - super().__init__() - self.allowed_types = ( - [allowed_types] if isinstance(allowed_types, str) else allowed_types - ) - self.check_extension = check_extension - self._normalize_types() - - def _normalize_types(self): - """Normalize content type strings""" - self.allowed_types = [t.lower() for t in self.allowed_types] - - def _check_extension(self, url: str) -> bool: - # PERF: urlparse is called on every check - # PERF: multiple string splits are expensive - # PERF: mimetypes.guess_type is very slow - ext = ( - urlparse(url).path.split(".")[-1].lower() - if "." in urlparse(url).path - else "" - ) - if not ext: - return True - - # PERF: guess_type is main bottleneck - guessed_type = mimetypes.guess_type(url)[0] - return any( - allowed in (guessed_type or "").lower() for allowed in self.allowed_types - ) - - def apply(self, url: str) -> bool: - """Check if URL's content type is allowed""" - result = True - if self.check_extension: - result = self._check_extension(url) - self._update_stats(result) - return result - - -class DomainFilter(URLFilter): - # PERF: Set lookups are fast but string normalizations on init are not - # PERF: Creating two sets doubles memory usage - def __init__( - self, - allowed_domains: Union[str, List[str]] = None, - blocked_domains: Union[str, List[str]] = None, - ): - super().__init__() - # PERF: Normalizing domains on every init is wasteful - # PERF: Could use frozenset for immutable lists - self.allowed_domains = ( - set(self._normalize_domains(allowed_domains)) if allowed_domains else None - ) - self.blocked_domains = ( - set(self._normalize_domains(blocked_domains)) if blocked_domains else set() - ) - - def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]: - # PERF: strip() and lower() create new strings for each domain - # PERF: List comprehension creates intermediate list - if isinstance(domains, str): - domains = [domains] - return [d.lower().strip() for d in domains] - - def _extract_domain(self, url: str) -> str: - # PERF: urlparse is called for every URL check - # PERF: lower() creates new string every time - # PERF: Could cache recent results - return urlparse(url).netloc.lower() - - def apply(self, url: str) -> bool: - # PERF: Two separate set lookups in worst case - # PERF: Domain extraction happens before knowing if we have any filters - domain = self._extract_domain(url) - - if domain in self.blocked_domains: - self._update_stats(False) - return False - - if self.allowed_domains is not None and domain not in self.allowed_domains: - self._update_stats(False) - return False - - self._update_stats(True) - return True - - -# Example usage: -def create_common_filter_chain() -> FilterChain: - """Create a commonly used filter chain""" - return FilterChain( - [ - URLPatternFilter( - [ - "*.html", - "*.htm", # HTML files - "*/article/*", - "*/blog/*", # Common content paths - ] - ), - ContentTypeFilter(["text/html", "application/xhtml+xml"]), - DomainFilter(blocked_domains=["ads.*", "analytics.*"]), - ] - ) - - -#################################################################################### -# Uncledoe: Optimized Version -#################################################################################### - - -# Use __slots__ and array for maximum memory/speed efficiency -@dataclass -class FastFilterStats: __slots__ = ("_counters",) def __init__(self): @@ -245,14 +37,14 @@ class FastFilterStats: return self._counters[2] -class FastURLFilter(ABC): +class URLFilter(ABC): """Optimized base filter class""" __slots__ = ("name", "stats", "_logger_ref") def __init__(self, name: str = None): self.name = name or self.__class__.__name__ - self.stats = FastFilterStats() + self.stats = FilterStats() # Lazy logger initialization using weakref self._logger_ref = None @@ -274,14 +66,14 @@ class FastURLFilter(ABC): self.stats._counters[2] += not passed # rejected -class FastFilterChain: +class FilterChain: """Optimized filter chain""" __slots__ = ("filters", "stats", "_logger_ref") - def __init__(self, filters: List[FastURLFilter] = None): + def __init__(self, filters: List[URLFilter] = None): self.filters = tuple(filters or []) # Immutable tuple for speed - self.stats = FastFilterStats() + self.stats = FilterStats() self._logger_ref = None @property @@ -291,94 +83,121 @@ class FastFilterChain: self._logger_ref = weakref.ref(logger) return self._logger_ref() - def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain": + def add_filter(self, filter_: URLFilter) -> "FilterChain": """Add a filter to the chain""" self.filters.append(filter_) return self # Enable method chaining - def apply(self, url: str) -> bool: - """Optimized apply with minimal operations""" - self.stats._counters[0] += 1 # total + async def apply(self, url: str) -> bool: + """Apply all filters concurrently when possible""" + self.stats._counters[0] += 1 # Total processed URLs - # Direct tuple iteration is faster than list + tasks = [] for f in self.filters: - if not f.apply(url): - self.stats._counters[2] += 1 # rejected + result = f.apply(url) + + if inspect.isawaitable(result): + tasks.append(result) # Collect async tasks + elif not result: # Sync rejection + self.stats._counters[2] += 1 # Sync rejected return False - self.stats._counters[1] += 1 # passed + if tasks: + results = await asyncio.gather(*tasks) + + # Count how many filters rejected + rejections = results.count(False) + self.stats._counters[2] += rejections + + if not all(results): + return False # Stop early if any filter rejected + + self.stats._counters[1] += 1 # Passed return True -class FastURLPatternFilter(FastURLFilter): + +class URLPatternFilter(URLFilter): """Pattern filter balancing speed and completeness""" - __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns') - + + __slots__ = ( + "_simple_suffixes", + "_simple_prefixes", + "_domain_patterns", + "_path_patterns", + ) + PATTERN_TYPES = { - 'SUFFIX': 1, # *.html - 'PREFIX': 2, # /foo/* - 'DOMAIN': 3, # *.example.com - 'PATH': 4 , # Everything else - 'REGEX': 5 + "SUFFIX": 1, # *.html + "PREFIX": 2, # /foo/* + "DOMAIN": 3, # *.example.com + "PATH": 4, # Everything else + "REGEX": 5, } - - def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True): + + def __init__( + self, + patterns: Union[str, Pattern, List[Union[str, Pattern]]], + use_glob: bool = True, + ): super().__init__() patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns - + self._simple_suffixes = set() self._simple_prefixes = set() self._domain_patterns = [] self._path_patterns = [] - + for pattern in patterns: pattern_type = self._categorize_pattern(pattern) self._add_pattern(pattern, pattern_type) - + def _categorize_pattern(self, pattern: str) -> int: """Categorize pattern for specialized handling""" if not isinstance(pattern, str): - return self.PATTERN_TYPES['PATH'] - + return self.PATTERN_TYPES["PATH"] + # Check if it's a regex pattern - if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern: - return self.PATTERN_TYPES['REGEX'] - - if pattern.count('*') == 1: - if pattern.startswith('*.'): - return self.PATTERN_TYPES['SUFFIX'] - if pattern.endswith('/*'): - return self.PATTERN_TYPES['PREFIX'] - - if '://' in pattern and pattern.startswith('*.'): - return self.PATTERN_TYPES['DOMAIN'] - - return self.PATTERN_TYPES['PATH'] - + if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern: + return self.PATTERN_TYPES["REGEX"] + + if pattern.count("*") == 1: + if pattern.startswith("*."): + return self.PATTERN_TYPES["SUFFIX"] + if pattern.endswith("/*"): + return self.PATTERN_TYPES["PREFIX"] + + if "://" in pattern and pattern.startswith("*."): + return self.PATTERN_TYPES["DOMAIN"] + + return self.PATTERN_TYPES["PATH"] + def _add_pattern(self, pattern: str, pattern_type: int): """Add pattern to appropriate matcher""" - if pattern_type == self.PATTERN_TYPES['REGEX']: + if pattern_type == self.PATTERN_TYPES["REGEX"]: # For regex patterns, compile directly without glob translation - if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern): + if isinstance(pattern, str) and ( + pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern + ): self._path_patterns.append(re.compile(pattern)) return - elif pattern_type == self.PATTERN_TYPES['SUFFIX']: + elif pattern_type == self.PATTERN_TYPES["SUFFIX"]: self._simple_suffixes.add(pattern[2:]) - elif pattern_type == self.PATTERN_TYPES['PREFIX']: + elif pattern_type == self.PATTERN_TYPES["PREFIX"]: self._simple_prefixes.add(pattern[:-2]) - elif pattern_type == self.PATTERN_TYPES['DOMAIN']: - self._domain_patterns.append( - re.compile(pattern.replace('*.', r'[^/]+\.')) - ) + elif pattern_type == self.PATTERN_TYPES["DOMAIN"]: + self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\."))) else: if isinstance(pattern, str): # Handle complex glob patterns - if '**' in pattern: - pattern = pattern.replace('**', '.*') - if '{' in pattern: + if "**" in pattern: + pattern = pattern.replace("**", ".*") + if "{" in pattern: # Convert {a,b} to (a|b) - pattern = re.sub(r'\{([^}]+)\}', - lambda m: f'({"|".join(m.group(1).split(","))})', - pattern) + pattern = re.sub( + r"\{([^}]+)\}", + lambda m: f'({"|".join(m.group(1).split(","))})', + pattern, + ) pattern = fnmatch.translate(pattern) self._path_patterns.append( pattern if isinstance(pattern, Pattern) else re.compile(pattern) @@ -389,36 +208,36 @@ class FastURLPatternFilter(FastURLFilter): """Hierarchical pattern matching""" # Quick suffix check (*.html) if self._simple_suffixes: - path = url.split('?')[0] - if path.split('/')[-1].split('.')[-1] in self._simple_suffixes: + path = url.split("?")[0] + if path.split("/")[-1].split(".")[-1] in self._simple_suffixes: self._update_stats(True) return True - + # Domain check if self._domain_patterns: for pattern in self._domain_patterns: if pattern.match(url): self._update_stats(True) return True - + # Prefix check (/foo/*) if self._simple_prefixes: - path = url.split('?')[0] + path = url.split("?")[0] if any(path.startswith(p) for p in self._simple_prefixes): self._update_stats(True) return True - + # Complex patterns if self._path_patterns: if any(p.search(url) for p in self._path_patterns): self._update_stats(True) return True - + self._update_stats(False) return False -class FastContentTypeFilter(FastURLFilter): +class ContentTypeFilter(URLFilter): """Optimized content type filter using fast lookups""" __slots__ = ("allowed_types", "_ext_map", "_check_extension") @@ -515,14 +334,30 @@ class FastContentTypeFilter(FastURLFilter): @staticmethod @lru_cache(maxsize=1000) - def _extract_extension(path: str) -> str: - """Fast extension extraction with caching""" - if "." not in path: + def _extract_extension(url: str) -> str: + """Extracts file extension from a URL.""" + # Remove scheme (http://, https://) if present + if "://" in url: + url = url.split("://", 1)[-1] # Get everything after '://' + + # Remove domain (everything up to the first '/') + path_start = url.find("/") + path = url[path_start:] if path_start != -1 else "" + + # Extract last filename in path + filename = path.rsplit("/", 1)[-1] if "/" in path else "" + + # Extract and validate extension + if "." not in filename: return "" - return path.rpartition(".")[-1].lower() + + return filename.rpartition(".")[-1].lower() def __init__( - self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP + self, + allowed_types: Union[str, List[str]], + check_extension: bool = True, + ext_map: Dict[str, str] = _MIME_MAP, ): super().__init__() # Normalize and store as frozenset for fast lookup @@ -546,9 +381,7 @@ class FastContentTypeFilter(FastURLFilter): """Cached URL checking""" if not self._check_extension: return True - - path = url.split("?")[0] # Fast path split - ext = self._extract_extension(path) + ext = self._extract_extension(url) if not ext: return True @@ -561,7 +394,7 @@ class FastContentTypeFilter(FastURLFilter): return result -class FastDomainFilter(FastURLFilter): +class DomainFilter(URLFilter): """Optimized domain filter with fast lookups and caching""" __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache") @@ -599,7 +432,7 @@ class FastDomainFilter(FastURLFilter): @lru_cache(maxsize=10000) def _extract_domain(url: str) -> str: """Ultra-fast domain extraction with regex and caching""" - match = FastDomainFilter._DOMAIN_REGEX.search(url) + match = DomainFilter._DOMAIN_REGEX.search(url) return match.group(1).lower() if match else "" def apply(self, url: str) -> bool: @@ -627,19 +460,24 @@ class FastDomainFilter(FastURLFilter): return result - class ContentRelevanceFilter(URLFilter): """BM25-based relevance filter using head section content""" - - __slots__ = ('query_terms', 'threshold', 'k1', 'b', 'avgdl') - - def __init__(self, query: str, threshold: float, - k1: float = 1.2, b: float = 0.75, avgdl: int = 1000): + + __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl") + + def __init__( + self, + query: str, + threshold: float, + k1: float = 1.2, + b: float = 0.75, + avgdl: int = 1000, + ): super().__init__(name="BM25RelevanceFilter") self.query_terms = self._tokenize(query) self.threshold = threshold self.k1 = k1 # TF saturation parameter - self.b = b # Length normalization parameter + self.b = b # Length normalization parameter self.avgdl = avgdl # Average document length (empirical value) async def apply(self, url: str) -> bool: @@ -650,11 +488,11 @@ class ContentRelevanceFilter(URLFilter): # Field extraction with weighting fields = { - 'title': HeadPeekr.get_title(head_content) or "", - 'meta': HeadPeekr.extract_meta_tags(head_content) + "title": HeadPeekr.get_title(head_content) or "", + "meta": HeadPeekr.extract_meta_tags(head_content), } doc_text = self._build_document(fields) - + score = self._bm25(doc_text) decision = score >= self.threshold self._update_stats(decision) @@ -662,12 +500,14 @@ class ContentRelevanceFilter(URLFilter): def _build_document(self, fields: Dict) -> str: """Weighted document construction""" - return ' '.join([ - fields['title'] * 3, # Title weight - fields['meta'].get('description', '') * 2, - fields['meta'].get('keywords', ''), - ' '.join(fields['meta'].values()) - ]) + return " ".join( + [ + fields["title"] * 3, # Title weight + fields["meta"].get("description", "") * 2, + fields["meta"].get("keywords", ""), + " ".join(fields["meta"].values()), + ] + ) def _tokenize(self, text: str) -> List[str]: """Fast case-insensitive tokenization""" @@ -678,7 +518,7 @@ class ContentRelevanceFilter(URLFilter): doc_terms = self._tokenize(document) doc_len = len(doc_terms) tf = defaultdict(int) - + for term in doc_terms: tf[term] += 1 @@ -687,8 +527,9 @@ class ContentRelevanceFilter(URLFilter): term_freq = tf[term] idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF numerator = term_freq * (self.k1 + 1) - denominator = term_freq + self.k1 * (1 - self.b + - self.b * (doc_len / self.avgdl)) + denominator = term_freq + self.k1 * ( + 1 - self.b + self.b * (doc_len / self.avgdl) + ) score += idf * (numerator / denominator) return score @@ -696,30 +537,36 @@ class ContentRelevanceFilter(URLFilter): class SEOFilter(URLFilter): """Quantitative SEO quality assessment filter using head section analysis""" - - __slots__ = ('threshold', '_weights', '_kw_patterns') - + + __slots__ = ("threshold", "_weights", "_kw_patterns") + # Based on SEMrush/Google ranking factors research DEFAULT_WEIGHTS = { - 'title_length': 0.15, - 'title_kw': 0.18, - 'meta_description': 0.12, - 'canonical': 0.10, - 'robot_ok': 0.20, # Most critical factor - 'schema_org': 0.10, - 'url_quality': 0.15 + "title_length": 0.15, + "title_kw": 0.18, + "meta_description": 0.12, + "canonical": 0.10, + "robot_ok": 0.20, # Most critical factor + "schema_org": 0.10, + "url_quality": 0.15, } - def __init__(self, threshold: float = 0.65, - keywords: List[str] = None, - weights: Dict[str, float] = None): + def __init__( + self, + threshold: float = 0.65, + keywords: List[str] = None, + weights: Dict[str, float] = None, + ): super().__init__(name="SEOFilter") self.threshold = threshold self._weights = weights or self.DEFAULT_WEIGHTS - self._kw_patterns = re.compile( - r'\b({})\b'.format('|'.join(map(re.escape, keywords or []))), - re.I - ) if keywords else None + self._kw_patterns = ( + re.compile( + r"\b({})\b".format("|".join(map(re.escape, keywords or []))), re.I + ) + if keywords + else None + ) async def apply(self, url: str) -> bool: head_content = await HeadPeekr.peek_html(url) @@ -728,313 +575,74 @@ class SEOFilter(URLFilter): return False meta = HeadPeekr.extract_meta_tags(head_content) - title = HeadPeekr.get_title(head_content) or '' + title = HeadPeekr.get_title(head_content) or "" parsed_url = urlparse(url) - + scores = { - 'title_length': self._score_title_length(title), - 'title_kw': self._score_keyword_presence(title), - 'meta_description': self._score_meta_description(meta.get('description', '')), - 'canonical': self._score_canonical(meta.get('canonical'), url), - 'robot_ok': 1.0 if 'noindex' not in meta.get('robots', '') else 0.0, - 'schema_org': self._score_schema_org(head_content), - 'url_quality': self._score_url_quality(parsed_url) + "title_length": self._score_title_length(title), + "title_kw": self._score_keyword_presence(title), + "meta_description": self._score_meta_description( + meta.get("description", "") + ), + "canonical": self._score_canonical(meta.get("canonical"), url), + "robot_ok": 1.0 if "noindex" not in meta.get("robots", "") else 0.0, + "schema_org": self._score_schema_org(head_content), + "url_quality": self._score_url_quality(parsed_url), } - total_score = sum(weight * scores[factor] - for factor, weight in self._weights.items()) - + total_score = sum( + weight * scores[factor] for factor, weight in self._weights.items() + ) + decision = total_score >= self.threshold self._update_stats(decision) return decision def _score_title_length(self, title: str) -> float: length = len(title) - if 50 <= length <= 60: return 1.0 - if 40 <= length < 50 or 60 < length <= 70: return 0.7 + if 50 <= length <= 60: + return 1.0 + if 40 <= length < 50 or 60 < length <= 70: + return 0.7 return 0.3 # Poor length def _score_keyword_presence(self, text: str) -> float: - if not self._kw_patterns: return 0.0 + if not self._kw_patterns: + return 0.0 matches = len(self._kw_patterns.findall(text)) return min(matches * 0.3, 1.0) # Max 3 matches def _score_meta_description(self, desc: str) -> float: length = len(desc) - if 140 <= length <= 160: return 1.0 + if 140 <= length <= 160: + return 1.0 return 0.5 if 120 <= length <= 200 else 0.2 def _score_canonical(self, canonical: str, original: str) -> float: - if not canonical: return 0.5 # Neutral score + if not canonical: + return 0.5 # Neutral score return 1.0 if canonical == original else 0.2 def _score_schema_org(self, html: str) -> float: # Detect any schema.org markup in head - return 1.0 if re.search(r']+type=["\']application/ld\+json', html) else 0.0 + return ( + 1.0 + if re.search(r']+type=["\']application/ld\+json', html) + else 0.0 + ) def _score_url_quality(self, parsed_url) -> float: score = 1.0 path = parsed_url.path.lower() - + # Penalty factors - if len(path) > 80: score *= 0.7 - if re.search(r'\d{4}', path): score *= 0.8 # Numbers in path - if parsed_url.query: score *= 0.6 # URL parameters - if '_' in path: score *= 0.9 # Underscores vs hyphens - + if len(path) > 80: + score *= 0.7 + if re.search(r"\d{4}", path): + score *= 0.8 # Numbers in path + if parsed_url.query: + score *= 0.6 # URL parameters + if "_" in path: + score *= 0.9 # Underscores vs hyphens + return score - -def create_fast_filter_chain() -> FastFilterChain: - """Create an optimized filter chain with filters ordered by rejection rate""" - return FastFilterChain( - [ - # Domain filter first (fastest rejection) - FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]), - # Content filter second (medium speed) - FastContentTypeFilter(["text/html", "application/xhtml+xml"]), - # Pattern filter last (most expensive) - FastURLPatternFilter( - [ - "*.html", - "*.htm", - "*/article/*", - "*/blog/*", - ] - ), - ] - ) - - -def run_performance_test(): - import time - - # Generate test URLs - base_urls = [ - "https://example.com/article/123", - "https://blog.example.com/post/456", - "https://ads.example.com/tracking", - "https://example.com/about.html", - "https://analytics.example.com/script.js", - "https://example.com/products.php", - "https://subdomain.example.com/blog/post-123", - "https://example.com/path/file.pdf", - ] - - # Create more varied test data - test_urls = [] - for base in base_urls: - # Add original - test_urls.append(base) - # Add variations - parts = base.split("/") - for i in range(10): - parts[-1] = f"page_{i}.html" - test_urls.append("/".join(parts)) - - # Multiply to get enough test data - test_urls = test_urls * 10000 # Creates ~800k URLs - - def benchmark(name: str, func, *args, warmup=True): - if warmup: - # Warmup run - func(*args) - - # Actual timing - start = time.perf_counter_ns() - result = func(*args) - elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms - print( - f"{name:<30} {elapsed:>8.3f} ms ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)" - ) - return result - - print("\nBenchmarking original vs optimized implementations...") - print("-" * 70) - - # Original implementation - pattern_filter = URLPatternFilter(["*.html", "*/article/*"]) - content_filter = ContentTypeFilter(["text/html"]) - domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"]) - chain = FilterChain([pattern_filter, content_filter, domain_filter]) - - # Optimized implementation - fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"]) - fast_content_filter = FastContentTypeFilter(["text/html"]) - fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]) - fast_chain = FastFilterChain( - [fast_domain_filter, fast_content_filter, fast_pattern_filter] - ) - - # Test individual filters - print("\nSingle filter performance (first 1000 URLs):") - test_subset = test_urls[:1000] - - print("\nPattern Filters:") - benchmark( - "Original Pattern Filter", - lambda: [pattern_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Pattern Filter", - lambda: [fast_pattern_filter.apply(url) for url in test_subset], - ) - - print("\nContent Filters:") - benchmark( - "Original Content Filter", - lambda: [content_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Content Filter", - lambda: [fast_content_filter.apply(url) for url in test_subset], - ) - - print("\nDomain Filters:") - benchmark( - "Original Domain Filter", - lambda: [domain_filter.apply(url) for url in test_subset], - ) - benchmark( - "Optimized Domain Filter", - lambda: [fast_domain_filter.apply(url) for url in test_subset], - ) - - print("\nFull Chain Performance (all URLs):") - # Test chain - benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls]) - benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls]) - - # Memory usage - import sys - - print("\nMemory Usage per Filter:") - print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes") - print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes") - print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes") - print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes") - print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes") - print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes") - -def test_pattern_filter(): - import time - from itertools import chain - - # Test cases as list of tuples instead of dict for multiple patterns - test_cases = [ - # Simple suffix patterns (*.html) - ("*.html", { - "https://example.com/page.html": True, - "https://example.com/path/doc.html": True, - "https://example.com/page.htm": False, - "https://example.com/page.html?param=1": True, - }), - - # Path prefix patterns (/foo/*) - ("*/article/*", { - "https://example.com/article/123": True, - "https://example.com/blog/article/456": True, - "https://example.com/articles/789": False, - "https://example.com/article": False, - }), - - # Complex patterns - ("blog-*-[0-9]", { - "https://example.com/blog-post-1": True, - "https://example.com/blog-test-9": True, - "https://example.com/blog-post": False, - "https://example.com/blog-post-x": False, - }), - - # Multiple patterns case - (["*.pdf", "*/download/*"], { - "https://example.com/doc.pdf": True, - "https://example.com/download/file.txt": True, - "https://example.com/path/download/doc": True, - "https://example.com/uploads/file.txt": False, - }), - - # Edge cases - ("*", { - "https://example.com": True, - "": True, - "http://test.com/path": True, - }), - - # Complex regex - (r"^https?://.*\.example\.com/\d+", { - "https://sub.example.com/123": True, - "http://test.example.com/456": True, - "https://example.com/789": False, - "https://sub.example.com/abc": False, - }) - ] - - def run_accuracy_test(): - print("\nAccuracy Tests:") - print("-" * 50) - - all_passed = True - for patterns, test_urls in test_cases: - filter_obj = FastURLPatternFilter(patterns) - - for url, expected in test_urls.items(): - result = filter_obj.apply(url) - if result != expected: - print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'") - print(f" Expected: {expected}, Got: {result}") - all_passed = False - else: - print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'") - - return all_passed - - def run_speed_test(): - print("\nSpeed Tests:") - print("-" * 50) - - # Create a large set of test URLs - all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases)) - test_urls = all_urls * 10000 # 100K+ URLs - - # Test both implementations - original = URLPatternFilter(["*.html", "*/article/*", "blog-*"]) - optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"]) - - def benchmark(name, filter_obj): - start = time.perf_counter() - for url in test_urls: - filter_obj.apply(url) - elapsed = time.perf_counter() - start - urls_per_sec = len(test_urls) / elapsed - print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)") - - benchmark("Original Filter:", original) - benchmark("Optimized Filter:", optimized) - - # Run tests - print("Running Pattern Filter Tests...") - accuracy_passed = run_accuracy_test() - - if accuracy_passed: - print("\n✨ All accuracy tests passed!") - run_speed_test() - else: - print("\n❌ Some accuracy tests failed!") - -async def test_content_relevancy_filter(): - # Initialize with query and threshold (tune based on your corpus) - relevance_filter = ContentRelevanceFilter( - query="machine learning", - threshold=2.5 - ) - - # In your crawler loop - for url in ["https://example.com", "https://example.com/blog/post-123"]: - if await relevance_filter.apply(url): - print(f"✅ Relevant: {url}") - else: - print(f"❌ Not Relevant: {url}") - -if __name__ == "__main__": - run_performance_test() - # test_pattern_filter() \ No newline at end of file diff --git a/crawl4ai/deep_crawling/scorers.py b/crawl4ai/deep_crawling/scorers.py index 53fdd6aa..1cd9f3e1 100644 --- a/crawl4ai/deep_crawling/scorers.py +++ b/crawl4ai/deep_crawling/scorers.py @@ -23,35 +23,7 @@ _FRESHNESS_SCORES = [ 0.5, # 5 years ago ] -# Pre-computed normalization factors for powers of 2 -_POW2_NORM = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625] - - -@dataclass class ScoringStats: - # PERF: Dataclass introduces overhead with property access and __init__ - # PERF: Float operations and comparisons are expensive for high-frequency updates - # PERF: Property calculation on every access is inefficient - # PERF: Storing min/max adds memory overhead and comparison costs - # PERF: Using inf/-inf creates unnecessary float objects - urls_scored: int = 0 - total_score: float = 0.0 - min_score: float = float("inf") # Expensive object creation - max_score: float = float("-inf") - - def update(self, score: float): - """Update scoring statistics""" - self.urls_scored += 1 - self.total_score += score - self.min_score = min(self.min_score, score) - self.max_score = max(self.max_score, score) - - @property - def average_score(self) -> float: - """Calculate average score""" - return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0 - -class FastScoringStats: __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score') def __init__(self): @@ -88,32 +60,7 @@ class FastScoringStats: if self._max_score is None: self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 return self._max_score - class URLScorer(ABC): - # PERF: Property access overhead for weight - # PERF: Unnecessary name attribute - # PERF: Stats object creation overhead - # PERF: Logger creation for each instance - # PERF: Abstract method overhead - - def __init__(self, weight: float = 1.0, name: str = None): - self.weight = weight - self.name = name or self.__class__.__name__ - self.stats = ScoringStats() - self.logger = logging.getLogger(f"urlscorer.{self.name}") - - @abstractmethod - def _calculate_score(self, url: str) -> float: - pass - - def score(self, url: str) -> float: - raw_score = self._calculate_score(url) - weighted_score = raw_score * self.weight - self.stats.update(weighted_score) - return weighted_score - -# Optimized base class -class FastURLScorer(ABC): __slots__ = ('_weight', '_stats') def __init__(self, weight: float = 1.0): @@ -142,31 +89,6 @@ class FastURLScorer(ABC): return self._weight class CompositeScorer(URLScorer): - # PERF: Unnecessary list iteration for each score - # PERF: Creates new list for scores - # PERF: Division on every normalization - # PERF: No parallelization for independent scorers - # PERF: No short circuit for zero scores - # PERF: No weighting optimization - # PERF: No caching of combined scores - # PERF: List allocation for scores storag - """Combines multiple scorers with weights""" - - def __init__(self, scorers: List[URLScorer], normalize: bool = True): - super().__init__(name="CompositeScorer") - self.scorers = scorers - self.normalize = normalize - - def _calculate_score(self, url: str) -> float: - scores = [scorer.score(url) for scorer in self.scorers] - total_score = sum(scores) - - if self.normalize and scores: - total_score /= len(scores) - - return total_score - -class FastCompositeScorer(FastURLScorer): __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array') def __init__(self, scorers: List[URLScorer], normalize: bool = True): @@ -235,51 +157,7 @@ class FastCompositeScorer(FastURLScorer): self.stats.update(score) return score -class KeywordRelevanceScorer(URLScorer): - # PERF: Regex compilation and pattern matching is expensive - # PERF: List comprehension with pattern search has high overhead - # PERF: URL decoding on every calculation - # PERF: Division operation for normalization is costly - # PERF: Case insensitive regex adds overhead - # PERF: No pattern caching or reuse - # PERF: Using inheritance adds method lookup overhead - - """Score URLs based on keyword relevance. - - keyword_scorer = KeywordRelevanceScorer( - keywords=["python", "programming"], - weight=1.0, - case_sensitive=False - ) - - - Score based on keyword matches - - Case sensitivity options - - Weighted scoring - """ - - def __init__( - self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False - ): - super().__init__(weight=weight) - self.keywords = keywords - self.case_sensitive = case_sensitive - self._compile_keywords() - - def _compile_keywords(self): - """Prepare keywords for matching""" - flags = 0 if self.case_sensitive else re.IGNORECASE - self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords] - - def _calculate_score(self, url: str) -> float: - """Calculate score based on keyword matches""" - decoded_url = unquote(url) - total_matches = sum( - 1 for pattern in self.patterns if pattern.search(decoded_url) - ) - # Normalize score between 0 and 1 - return total_matches / len(self.patterns) if self.patterns else 0.0 - -class FastKeywordRelevanceScorer(FastURLScorer): +class KeywordRelevanceScorer(URLScorer): __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive') def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False): @@ -310,39 +188,6 @@ class FastKeywordRelevanceScorer(FastURLScorer): return matches / len(self._keywords) class PathDepthScorer(URLScorer): - # PERF: URL parsing on every call is expensive - # PERF: Split and list comprehension creates temporary lists - # PERF: abs() call adds function overhead - # PERF: Division and addition in score calculation are expensive for high frequency - # PERF: Path parts filtering creates extra list - # PERF: Inherits URLScorer adding method lookup overhead - # PERF: No caching of parsed URLs or calculated depths - """Score URLs based on their path depth. - - path_scorer = PathDepthScorer( - optimal_depth=3, # Preferred URL depth - weight=0.7 - ) - - - Score based on URL path depth - - Configurable optimal depth - - Diminishing returns for deeper paths - """ - - def __init__(self, optimal_depth: int = 3, weight: float = 1.0): - super().__init__(weight=weight) - self.optimal_depth = optimal_depth - - def _calculate_score(self, url: str) -> float: - """Calculate score based on path depth""" - path = urlparse(url).path - depth = len([x for x in path.split("/") if x]) - - # Score decreases as we move away from optimal depth - distance_from_optimal = abs(depth - self.optimal_depth) - return 1.0 / (1.0 + distance_from_optimal) - -class FastPathDepthScorer(FastURLScorer): __slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache def __init__(self, optimal_depth: int = 3, weight: float = 1.0): @@ -400,45 +245,6 @@ class FastPathDepthScorer(FastURLScorer): return 1.0 / (1.0 + distance) class ContentTypeScorer(URLScorer): - # PERF: Regex compilation on every initialization - # PERF: Dict lookup and regex search for every URL - # PERF: Pattern iteration adds loop overhead - # PERF: No pattern priority or short-circuit - # PERF: Dict storage has lookup overhead - # PERF: Missing extension fast path check - # PERF: Unnecessary regex for simple extensions - """Score URLs based on content type preferences. - - content_scorer = ContentTypeScorer({ - r'\.html$': 1.0, - r'\.pdf$': 0.8, - r'\.xml$': 0.6 - }) - - - Score based on file types - - Configurable type weights - - Pattern matching support - """ - - def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): - super().__init__(weight=weight) - self.type_weights = type_weights - self._compile_patterns() - - def _compile_patterns(self): - """Prepare content type patterns""" - self.patterns = { - re.compile(pattern): weight for pattern, weight in self.type_weights.items() - } - - def _calculate_score(self, url: str) -> float: - """Calculate score based on content type matching""" - for pattern, weight in self.patterns.items(): - if pattern.search(url): - return weight - return 0.0 - -class FastContentTypeScorer(FastURLScorer): __slots__ = ('_weight', '_exact_types', '_regex_types') def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): @@ -524,45 +330,6 @@ class FastContentTypeScorer(FastURLScorer): return 0.0 class FreshnessScorer(URLScorer): - # PERF: Multiple regex compilations for each pattern - # PERF: Tries all patterns sequentially - # PERF: Regex pattern matching is expensive - # PERF: Int conversion and arithmetic for every match - # PERF: Repeated constant value (2024) hardcoded - # PERF: No URL caching - # PERF: Complex patterns with redundant groups - # PERF: Unnecessary list of patterns when could combine - """Score URLs based on freshness indicators. - - freshness_scorer = FreshnessScorer(weight=0.9) - - Score based on date indicators in URLs - Multiple date format support - Recency weighting""" - - def __init__(self, weight: float = 1.0): - super().__init__(weight=weight) - self.date_patterns = [ - r"/(\d{4})/(\d{2})/(\d{2})/", # yyyy/mm/dd - r"(\d{4})[-_](\d{2})[-_](\d{2})", # yyyy-mm-dd - r"/(\d{4})/", # year only - ] - self._compile_patterns() - - def _compile_patterns(self): - """Prepare date patterns""" - self.compiled_patterns = [re.compile(p) for p in self.date_patterns] - - def _calculate_score(self, url: str) -> float: - """Calculate score based on date indicators""" - for pattern in self.compiled_patterns: - if match := pattern.search(url): - year = int(match.group(1)) - # Score higher for more recent years - return 1.0 - (2024 - year) * 0.1 - return 0.5 # Default score for URLs without dates - -class FastFreshnessScorer(FastURLScorer): __slots__ = ('_weight', '_date_pattern', '_current_year') def __init__(self, weight: float = 1.0, current_year: int = 2024): @@ -645,41 +412,6 @@ class FastFreshnessScorer(FastURLScorer): return max(0.1, 1.0 - year_diff * 0.1) class DomainAuthorityScorer(URLScorer): - # PERF: URL parsing on every score calculation - # PERF: Repeated domain extraction - # PERF: Case conversion on every lookup - # PERF: Dict lookup without caching - # PERF: Processes full URL when only needs domain - # PERF: No fast path for common domains - # PERF: Netloc includes port which requires extra processing - """Score URLs based on domain authority. - - authority_scorer = DomainAuthorityScorer({ - "python.org": 1.0, - "github.com": 0.9, - "medium.com": 0.7 - }) - - Score based on domain importance - Configurable domain weights - Default weight for unknown domains""" - - def __init__( - self, - domain_weights: Dict[str, float], - default_weight: float = 0.5, - weight: float = 1.0, - ): - super().__init__(weight=weight) - self.domain_weights = domain_weights - self.default_weight = default_weight - - def _calculate_score(self, url: str) -> float: - """Calculate score based on domain authority""" - domain = urlparse(url).netloc.lower() - return self.domain_weights.get(domain, self.default_weight) - -class FastDomainAuthorityScorer(FastURLScorer): __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains') def __init__( @@ -784,419 +516,4 @@ class FastDomainAuthorityScorer(FastURLScorer): return score # Regular path: check all domains - return self._domain_weights.get(domain, self._default_weight) - -def create_balanced_scorer() -> CompositeScorer: - """Create a balanced composite scorer""" - return CompositeScorer( - [ - KeywordRelevanceScorer( - keywords=["article", "blog", "news", "research"], weight=1.0 - ), - PathDepthScorer(optimal_depth=3, weight=0.7), - ContentTypeScorer( - type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6}, - weight=0.8, - ), - FreshnessScorer(weight=0.9), - ] - ) - -def create_balanced_fast_freshness_scorer() -> CompositeScorer: - """Create a balanced composite scorer with fast freshness scorer""" - return FastCompositeScorer( - [ - FastKeywordRelevanceScorer( - keywords=["article", "blog", "news", "research"], weight=1.0 - ), - FastPathDepthScorer(optimal_depth=3, weight=0.7), - FastContentTypeScorer( - type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6}, - weight=0.8, - ), - FastFreshnessScorer(weight=0.9), - ] - ) - -# Example Usage: -""" -# Create a composite scorer -scorer = CompositeScorer([ - KeywordRelevanceScorer(["python", "programming"], weight=1.0), - PathDepthScorer(optimal_depth=2, weight=0.7), - FreshnessScorer(weight=0.8), - DomainAuthorityScorer( - domain_weights={ - "python.org": 1.0, - "github.com": 0.9, - "medium.com": 0.7 - }, - weight=0.9 - ) -]) - -# Score a URL -score = scorer.score("https://python.org/article/2024/01/new-features") - -# Access statistics -print(f"Average score: {scorer.stats.average_score}") -print(f"URLs scored: {scorer.stats.urls_scored}") -""" - - -def run_scorer_performance_test(): - import time - import random - from itertools import cycle - import sys - - # Generate varied test URLs - base_urls = [ - # News/blog articles with dates - "https://example.com/2024/01/article-123", - "https://news.com/2023-12-31/breaking-news", - "https://blog.site.com/2022_11_15/tech-update", - - # Different content types - "https://docs.example.com/report.pdf", - "https://site.com/page.html?q=test", - "https://api.service.com/data.json", - - # Various domain authorities - "https://python.org/downloads", - "https://github.com/repo/code", - "https://medium.com/@user/post", - - # Different path depths - "https://site.com/category/subcategory/product/detail", - "https://shop.com/items", - "https://edu.org/courses/cs/intro/lecture1", - ] - - # Create variations - test_urls = [] - years = list(range(2020, 2025)) - domains = ["example.com", "python.org", "github.com", "medium.com"] - extensions = ["html", "pdf", "php", "jsx"] - - for base in base_urls: - test_urls.append(base) - # Add year variations - for year in years: - test_urls.append(f"https://blog.com/{year}/post-{random.randint(1,999)}") - # Add domain variations - for domain in domains: - test_urls.append(f"https://{domain}/article-{random.randint(1,999)}") - # Add extension variations - for ext in extensions: - test_urls.append(f"https://site.com/doc-{random.randint(1,999)}.{ext}") - - # Multiply dataset - test_urls = test_urls * 5000 # Creates ~300k URLs - - def benchmark(name: str, scorer, urls, warmup=True): - if warmup: - for url in urls[:100]: # Warmup with subset - scorer.score(url) - - start = time.perf_counter_ns() - for url in urls: - scorer.score(url) - elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms - - print( - f"{name:<35} {elapsed:>8.3f} ms ({len(urls)/elapsed*1000:,.0f} URLs/sec)" - ) - return elapsed - - print("\nBenchmarking original vs optimized scorers...") - print("-" * 75) - - # Initialize test data - domain_weights = {"python.org": 1.0, "github.com": 0.9, "medium.com": 0.7} - type_weights = {".html$": 1.0, ".pdf$": 0.8, ".php$": 0.6} - keywords = ["python", "article", "blog", "docs"] - - # Original implementations - keyword_scorer = KeywordRelevanceScorer(keywords=keywords, weight=1.0) - path_scorer = PathDepthScorer(optimal_depth=3, weight=0.7) - content_scorer = ContentTypeScorer(type_weights=type_weights, weight=0.8) - freshness_scorer = FreshnessScorer(weight=0.9) - domain_scorer = DomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) - - # Fast implementations - fast_keyword_scorer = FastKeywordRelevanceScorer(keywords=keywords, weight=1.0) - fast_path_scorer = FastPathDepthScorer(optimal_depth=3, weight=0.7) - fast_content_scorer = FastContentTypeScorer(type_weights=type_weights, weight=0.8) - fast_freshness_scorer = FastFreshnessScorer(weight=0.9) - fast_domain_scorer = FastDomainAuthorityScorer(domain_weights=domain_weights, weight=1.0) - - # Test subset for individual scorers - test_subset = test_urls[:1000] - - print("\nIndividual Scorer Performance (first 1000 URLs):") - - print("\nKeyword Relevance Scorers:") - benchmark("Original Keyword Scorer", keyword_scorer, test_subset) - benchmark("Optimized Keyword Scorer", fast_keyword_scorer, test_subset) - - print("\nPath Depth Scorers:") - benchmark("Original Path Scorer", path_scorer, test_subset) - benchmark("Optimized Path Scorer", fast_path_scorer, test_subset) - - print("\nContent Type Scorers:") - benchmark("Original Content Scorer", content_scorer, test_subset) - benchmark("Optimized Content Scorer", fast_content_scorer, test_subset) - - print("\nFreshness Scorers:") - benchmark("Original Freshness Scorer", freshness_scorer, test_subset) - benchmark("Optimized Freshness Scorer", fast_freshness_scorer, test_subset) - - print("\nDomain Authority Scorers:") - benchmark("Original Domain Scorer", domain_scorer, test_subset) - benchmark("Optimized Domain Scorer", fast_domain_scorer, test_subset) - - # Test composite scorers - print("\nComposite Scorer Performance (all URLs):") - - original_composite = CompositeScorer([ - keyword_scorer, path_scorer, content_scorer, - freshness_scorer, domain_scorer - ]) - - fast_composite = FastCompositeScorer([ - fast_keyword_scorer, fast_path_scorer, fast_content_scorer, - fast_freshness_scorer, fast_domain_scorer - ]) - - benchmark("Original Composite Scorer", original_composite, test_urls) - benchmark("Optimized Composite Scorer", fast_composite, test_urls) - - # Memory usage - print("\nMemory Usage per Scorer:") - print(f"Original Keyword Scorer: {sys.getsizeof(keyword_scorer):,} bytes") - print(f"Optimized Keyword Scorer: {sys.getsizeof(fast_keyword_scorer):,} bytes") - print(f"Original Path Scorer: {sys.getsizeof(path_scorer):,} bytes") - print(f"Optimized Path Scorer: {sys.getsizeof(fast_path_scorer):,} bytes") - print(f"Original Content Scorer: {sys.getsizeof(content_scorer):,} bytes") - print(f"Optimized Content Scorer: {sys.getsizeof(fast_content_scorer):,} bytes") - print(f"Original Freshness Scorer: {sys.getsizeof(freshness_scorer):,} bytes") - print(f"Optimized Freshness Scorer: {sys.getsizeof(fast_freshness_scorer):,} bytes") - print(f"Original Domain Scorer: {sys.getsizeof(domain_scorer):,} bytes") - print(f"Optimized Domain Scorer: {sys.getsizeof(fast_domain_scorer):,} bytes") - print(f"Original Composite: {sys.getsizeof(original_composite):,} bytes") - print(f"Optimized Composite: {sys.getsizeof(fast_composite):,} bytes") - -def test_scorers(): - import time - from itertools import chain - - test_cases = [ - # Keyword Scorer Tests - { - "scorer_type": "keyword", - "config": { - "keywords": ["python", "blog"], - "weight": 1.0, - "case_sensitive": False - }, - "urls": { - "https://example.com/python-blog": 1.0, - "https://example.com/PYTHON-BLOG": 1.0, - "https://example.com/python-only": 0.5, - "https://example.com/other": 0.0 - } - }, - - # Path Depth Scorer Tests - { - "scorer_type": "path_depth", - "config": { - "optimal_depth": 2, - "weight": 1.0 - }, - "urls": { - "https://example.com/a/b": 1.0, - "https://example.com/a": 0.5, - "https://example.com/a/b/c": 0.5, - "https://example.com": 0.33333333 - } - }, - - # Content Type Scorer Tests - { - "scorer_type": "content_type", - "config": { - "type_weights": { - ".html$": 1.0, - ".pdf$": 0.8, - ".jpg$": 0.6 - }, - "weight": 1.0 - }, - "urls": { - "https://example.com/doc.html": 1.0, - "https://example.com/doc.pdf": 0.8, - "https://example.com/img.jpg": 0.6, - "https://example.com/other.txt": 0.0 - } - }, - - # Freshness Scorer Tests - { - "scorer_type": "freshness", - "config": { - "weight": 1.0, # Remove current_year since original doesn't support it - }, - "urls": { - "https://example.com/2024/01/post": 1.0, - "https://example.com/2023/12/post": 0.9, - "https://example.com/2022/post": 0.8, - "https://example.com/no-date": 0.5 - } - }, - - # Domain Authority Scorer Tests - { - "scorer_type": "domain", - "config": { - "domain_weights": { - "python.org": 1.0, - "github.com": 0.8, - "medium.com": 0.6 - }, - "default_weight": 0.3, - "weight": 1.0 - }, - "urls": { - "https://python.org/about": 1.0, - "https://github.com/repo": 0.8, - "https://medium.com/post": 0.6, - "https://unknown.com": 0.3 - } - } - ] - - def create_scorer(scorer_type, config): - if scorer_type == "keyword": - return ( - KeywordRelevanceScorer(**config), - FastKeywordRelevanceScorer(**config) - ) - elif scorer_type == "path_depth": - return ( - PathDepthScorer(**config), - FastPathDepthScorer(**config) - ) - elif scorer_type == "content_type": - return ( - ContentTypeScorer(**config), - FastContentTypeScorer(**config) - ) - elif scorer_type == "freshness": - return ( - FreshnessScorer(**config), - FastFreshnessScorer(**config, current_year=2024) - ) - elif scorer_type == "domain": - return ( - DomainAuthorityScorer(**config), - FastDomainAuthorityScorer(**config) - ) - - def run_accuracy_test(): - print("\nAccuracy Tests:") - print("-" * 50) - - all_passed = True - for test_case in test_cases: - print(f"\nTesting {test_case['scorer_type']} scorer:") - original, fast = create_scorer( - test_case['scorer_type'], - test_case['config'] - ) - - for url, expected in test_case['urls'].items(): - orig_score = round(original.score(url), 8) - fast_score = round(fast.score(url), 8) - expected = round(expected, 8) - - if abs(orig_score - expected) > 0.00001: - print(f"❌ Original Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {orig_score}") - all_passed = False - else: - print(f"✅ Original Passed: URL '{url}'") - - if abs(fast_score - expected) > 0.00001: - print(f"❌ Fast Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {fast_score}") - all_passed = False - else: - print(f"✅ Fast Passed: URL '{url}'") - - return all_passed - - def run_composite_test(): - print("\nTesting Composite Scorer:") - print("-" * 50) - - # Create test data - test_urls = { - "https://python.org/blog/2024/01/new-release.html":0.86666667, - "https://github.com/repo/old-code.pdf": 0.62, - "https://unknown.com/random": 0.26 - } - - # Create composite scorers with all types - original_scorers = [] - fast_scorers = [] - - for test_case in test_cases: - orig, fast = create_scorer( - test_case['scorer_type'], - test_case['config'] - ) - original_scorers.append(orig) - fast_scorers.append(fast) - - original_composite = CompositeScorer(original_scorers, normalize=True) - fast_composite = FastCompositeScorer(fast_scorers, normalize=True) - - all_passed = True - for url, expected in test_urls.items(): - orig_score = round(original_composite.score(url), 8) - fast_score = round(fast_composite.score(url), 8) - - if abs(orig_score - expected) > 0.00001: - print(f"❌ Original Composite Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {orig_score}") - all_passed = False - else: - print(f"✅ Original Composite Passed: URL '{url}'") - - if abs(fast_score - expected) > 0.00001: - print(f"❌ Fast Composite Failed: URL '{url}'") - print(f" Expected: {expected}, Got: {fast_score}") - all_passed = False - else: - print(f"✅ Fast Composite Passed: URL '{url}'") - - return all_passed - - # Run tests - print("Running Scorer Tests...") - accuracy_passed = run_accuracy_test() - composite_passed = run_composite_test() - - if accuracy_passed and composite_passed: - print("\n✨ All tests passed!") - # Note: Already have performance tests in run_scorer_performance_test() - else: - print("\n❌ Some tests failed!") - - - -if __name__ == "__main__": - run_scorer_performance_test() - # test_scorers() \ No newline at end of file + return self._domain_weights.get(domain, self._default_weight) \ No newline at end of file diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py index a3349e70..ca15b453 100644 --- a/crawl4ai/html2text/__init__.py +++ b/crawl4ai/html2text/__init__.py @@ -510,6 +510,7 @@ class HTML2Text(html.parser.HTMLParser): if tag == "a" and not self.ignore_links: if start: + self.inside_link = True if ( "href" in attrs and attrs["href"] is not None @@ -526,6 +527,7 @@ class HTML2Text(html.parser.HTMLParser): else: self.astack.append(None) else: + self.inside_link = False if self.astack: a = self.astack.pop() if self.maybe_automatic_link and not self.empty_link: @@ -610,13 +612,22 @@ class HTML2Text(html.parser.HTMLParser): self.o("[" + str(a_props.count) + "]") if tag == "dl" and start: - self.p() - if tag == "dt" and not start: - self.pbr() - if tag == "dd" and start: - self.o(" ") - if tag == "dd" and not start: - self.pbr() + self.p() # Add paragraph break before list starts + self.p_p = 0 # Reset paragraph state + + elif tag == "dt" and start: + if self.p_p == 0: # If not first term + self.o("\n\n") # Add spacing before new term-definition pair + self.p_p = 0 # Reset paragraph state + + elif tag == "dt" and not start: + self.o("\n") # Single newline between term and definition + + elif tag == "dd" and start: + self.o(" ") # Indent definition + + elif tag == "dd" and not start: + self.p_p = 0 if tag in ["ol", "ul"]: # Google Docs create sub lists as top level lists @@ -1026,6 +1037,7 @@ class CustomHTML2Text(HTML2Text): super().__init__(*args, **kwargs) self.inside_pre = False self.inside_code = False + self.inside_link = False self.preserve_tags = set() # Set of tags to preserve self.current_preserved_tag = None self.preserved_content = [] @@ -1105,11 +1117,17 @@ class CustomHTML2Text(HTML2Text): # Ignore code tags inside pre blocks if handle_code_in_pre is False return if start: - self.o("`") # Markdown inline code start + if not self.inside_link: + self.o("`") # Only output backtick if not inside a link self.inside_code = True else: - self.o("`") # Markdown inline code end + if not self.inside_link: + self.o("`") # Only output backtick if not inside a link self.inside_code = False + + # If inside a link, let the parent class handle the content + if self.inside_link: + super().handle_tag(tag, attrs, start) else: super().handle_tag(tag, attrs, start) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index f90aa665..f37abc18 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -179,7 +179,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): "ignore_emphasis": False, "ignore_links": False, "ignore_images": False, - "protect_links": True, + "protect_links": False, "single_line_break": True, "mark_code": True, "escape_snob": False, diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index 99e0f854..59e9baf0 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -198,7 +198,7 @@ Avoid Common Mistakes: - Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors. - Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places. - Do not miss closing tag at the end of the JSON output. -- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format. +- Do not generate the Python code show me how to do the task, this is your task to extract the information and return it in JSON format. Result Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly.""" diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md index 6b3776d1..b56f216e 100644 --- a/docs/md_v2/advanced/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -7,8 +7,8 @@ Crawl4AI offers multiple power-user features that go beyond simple crawling. Thi 2. **Capturing PDFs & Screenshots** 3. **Handling SSL Certificates** 4. **Custom Headers** -5. **Session Persistence & Local Storage** -6. **Robots.txt Compliance** +5. **Session Persistence & Local Storage** +6. **Robots.txt Compliance** > **Prerequisites** > - You have a basic grasp of [AsyncWebCrawler Basics](../core/simple-crawling.md) diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 5d46ef10..0e59d465 100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -168,10 +168,10 @@ async def main(): "name": "News Items", "baseSelector": "tr.athing", "fields": [ - {"name": "title", "selector": "a.storylink", "type": "text"}, + {"name": "title", "selector": "span.titleline a", "type": "text"}, { "name": "link", - "selector": "a.storylink", + "selector": "span.titleline a", "type": "attribute", "attribute": "href" } diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index 04614533..7b5a2583 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -135,14 +135,14 @@ html = "

Gaming Laptop

$999.99 0.00001: + print(f"❌ Scorer Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {score}") + all_passed = False + else: + print(f"✅ Scorer Passed: URL '{url}'") + + + return all_passed + + def run_composite_test(): + print("\nTesting Composite Scorer:") + print("-" * 50) + + # Create test data + test_urls = { + "https://python.org/blog/2024/01/new-release.html":0.86666667, + "https://github.com/repo/old-code.pdf": 0.62, + "https://unknown.com/random": 0.26 + } + + # Create composite scorers with all types + scorers = [] + + for test_case in test_cases: + scorer = create_scorer( + test_case['scorer_type'], + test_case['config'] + ) + scorers.append(scorer) + + composite = CompositeScorer(scorers, normalize=True) + + all_passed = True + for url, expected in test_urls.items(): + score = round(composite.score(url), 8) + + if abs(score - expected) > 0.00001: + print(f"❌ Composite Failed: URL '{url}'") + print(f" Expected: {expected}, Got: {score}") + all_passed = False + else: + print(f"✅ Composite Passed: URL '{url}'") + + return all_passed + + # Run tests + print("Running Scorer Tests...") + accuracy_passed = run_accuracy_test() + composite_passed = run_composite_test() + + if accuracy_passed and composite_passed: + print("\n✨ All tests passed!") + # Note: Already have performance tests in run_scorer_performance_test() + else: + print("\n❌ Some tests failed!") + + + +if __name__ == "__main__": + test_scorers() \ No newline at end of file