2025 feb alpha 1 (#685)

* spelling change in prompt

* gpt-4o-mini support

* Remove leading Y before here

* prompt spell correction

* (Docs) Fix numbered list end-of-line formatting

Added the missing "two spaces" to add a line break

* fix: access downloads_path through browser_config in _handle_download method - Fixes #585

* crawl

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/583

* Docs update: https://github.com/unclecode/crawl4ai/issues/649

* fix: https://github.com/unclecode/crawl4ai/issues/570

* Docs: updated example for content-selection to reflect new changes in yc newsfeed css

* Refactor: Removed old filters and replaced with optimised filters

* fix: Fixed imports as per the new names of filters

* Tests: For deep crawl filters

* Refactor: Remove old scorers and replace with optimised ones: Fix imports for all filters and scorers.

* fix: awaiting on filters that are async in nature eg: content relevance and seo filters

* fix: https://github.com/unclecode/crawl4ai/issues/592

* fix: https://github.com/unclecode/crawl4ai/issues/715

---------

Co-authored-by: DarshanTank <darshan.tank@gnani.ai>
Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com>
Co-authored-by: Serhat Soydan <ssoydan@gmail.com>
Co-authored-by: cardit1 <maneesh@cardit.in>
Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
This commit is contained in:
Aravind
2025-02-19 11:43:17 +05:30
committed by GitHub
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions

View File

@@ -17,11 +17,16 @@ from .extraction_strategy import (
LLMExtractionStrategy, LLMExtractionStrategy,
CosineStrategy, CosineStrategy,
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
JsonXPathExtractionStrategy JsonXPathExtractionStrategy,
) )
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter from .content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_dispatcher import ( from .async_dispatcher import (
MemoryAdaptiveDispatcher, MemoryAdaptiveDispatcher,
@@ -29,20 +34,25 @@ from .async_dispatcher import (
RateLimiter, RateLimiter,
CrawlerMonitor, CrawlerMonitor,
DisplayMode, DisplayMode,
BaseDispatcher BaseDispatcher,
) )
from .docker_client import Crawl4aiDockerClient from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub from .hub import CrawlerHub
from .deep_crawling import ( from .deep_crawling import (
DeepCrawlStrategy, DeepCrawlStrategy,
BFSDeepCrawlStrategy, BFSDeepCrawlStrategy,
FastFilterChain, FilterChain,
FastContentTypeFilter, ContentTypeFilter,
FastDomainFilter, DomainFilter,
FastURLFilter, URLFilter,
FastFilterStats, FilterStats,
FastKeywordRelevanceScorer, SEOFilter,
FastURLScorer, KeywordRelevanceScorer,
URLScorer,
CompositeScorer,
DomainAuthorityScorer,
FreshnessScorer,
PathDepthScorer,
BestFirstCrawlingStrategy, BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy, DFSDeepCrawlStrategy,
DeepCrawlDecorator, DeepCrawlDecorator,
@@ -54,13 +64,18 @@ __all__ = [
"BFSDeepCrawlStrategy", "BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy", "BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy", "DFSDeepCrawlStrategy",
"FastFilterChain", "FilterChain",
"FastContentTypeFilter", "ContentTypeFilter",
"FastDomainFilter", "DomainFilter",
"FastFilterStats", "FilterStats",
"FastURLFilter", "URLFilter",
"FastKeywordRelevanceScorer", "SEOFilter",
"FastURLScorer", "KeywordRelevanceScorer",
"URLScorer",
"CompositeScorer",
"DomainAuthorityScorer",
"FreshnessScorer",
"PathDepthScorer",
"DeepCrawlDecorator", "DeepCrawlDecorator",
"CrawlResult", "CrawlResult",
"CrawlerHub", "CrawlerHub",

View File

@@ -886,7 +886,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
""" """
try: try:
viewport_height = page.viewport_size.get( viewport_size = page.viewport_size
if viewport_size is None:
await page.set_viewport_size(
{"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height}
)
viewport_size = page.viewport_size
viewport_height = viewport_size.get(
"height", self.browser_config.viewport_height "height", self.browser_config.viewport_height
) )
current_position = viewport_height current_position = viewport_height
@@ -946,7 +953,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
""" """
try: try:
suggested_filename = download.suggested_filename suggested_filename = download.suggested_filename
download_path = os.path.join(self.downloads_path, suggested_filename) download_path = os.path.join(self.browser_config.downloads_path, suggested_filename)
self.logger.info( self.logger.info(
message="Downloading {filename} to {path}", message="Downloading {filename} to {path}",

View File

@@ -166,7 +166,7 @@ class AsyncWebCrawler:
) )
# Initialize crawler strategy # Initialize crawler strategy
params = {k: v for k, v in kwargs.items() if k in ["browser_congig", "logger"]} params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]}
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
browser_config=browser_config, browser_config=browser_config,
logger=self.logger, logger=self.logger,

View File

@@ -4,15 +4,22 @@ from .bfs_strategy import BFSDeepCrawlStrategy
from .bff_strategy import BestFirstCrawlingStrategy from .bff_strategy import BestFirstCrawlingStrategy
from .dfs_strategy import DFSDeepCrawlStrategy from .dfs_strategy import DFSDeepCrawlStrategy
from .filters import ( from .filters import (
FastFilterChain, FilterChain,
FastContentTypeFilter, ContentTypeFilter,
FastDomainFilter, DomainFilter,
FastURLFilter, URLFilter,
FastFilterStats, FilterStats,
ContentRelevanceFilter,
SEOFilter
) )
from .scorers import ( from .scorers import (
FastKeywordRelevanceScorer, KeywordRelevanceScorer,
FastURLScorer, URLScorer,
CompositeScorer,
DomainAuthorityScorer,
FreshnessScorer,
PathDepthScorer,
ContentTypeScorer
) )
__all__ = [ __all__ = [
@@ -21,11 +28,18 @@ __all__ = [
"BFSDeepCrawlStrategy", "BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy", "BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy", "DFSDeepCrawlStrategy",
"FastFilterChain", "FilterChain",
"FastContentTypeFilter", "ContentTypeFilter",
"FastDomainFilter", "DomainFilter",
"FastURLFilter", "URLFilter",
"FastFilterStats", "FilterStats",
"FastKeywordRelevanceScorer", "ContentRelevanceFilter",
"FastURLScorer", "SEOFilter",
"KeywordRelevanceScorer",
"URLScorer",
"CompositeScorer",
"DomainAuthorityScorer",
"FreshnessScorer",
"PathDepthScorer",
"ContentTypeScorer",
] ]

View File

@@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from ..models import TraversalStats from ..models import TraversalStats
from .filters import FastFilterChain from .filters import FilterChain
from .scorers import FastURLScorer from .scorers import URLScorer
from . import DeepCrawlStrategy from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
@@ -34,8 +34,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
def __init__( def __init__(
self, self,
max_depth: int, max_depth: int,
filter_chain: FastFilterChain = FastFilterChain(), filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[FastURLScorer] = None, url_scorer: Optional[URLScorer] = None,
include_external: bool = False, include_external: bool = False,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
): ):
@@ -64,7 +64,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.logger.warning(f"Invalid URL: {url}, error: {e}") self.logger.warning(f"Invalid URL: {url}, error: {e}")
return False return False
if depth != 0 and not self.filter_chain.apply(url): if depth != 0 and not await self.filter_chain.apply(url):
return False return False
return True return True

View File

@@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from urllib.parse import urlparse from urllib.parse import urlparse
from ..models import TraversalStats from ..models import TraversalStats
from .filters import FastFilterChain from .filters import FilterChain
from .scorers import FastURLScorer from .scorers import URLScorer
from . import DeepCrawlStrategy from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
@@ -23,8 +23,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
def __init__( def __init__(
self, self,
max_depth: int, max_depth: int,
filter_chain: FastFilterChain = FastFilterChain(), filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[FastURLScorer] = None, url_scorer: Optional[URLScorer] = None,
include_external: bool = False, include_external: bool = False,
logger: Optional[logging.Logger] = None, logger: Optional[logging.Logger] = None,
): ):
@@ -53,7 +53,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.logger.warning(f"Invalid URL: {url}, error: {e}") self.logger.warning(f"Invalid URL: {url}, error: {e}")
return False return False
if depth != 0 and not self.filter_chain.apply(url): if depth != 0 and not await self.filter_chain.apply(url):
return False return False
return True return True

View File

@@ -374,7 +374,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
parsed = urlparse(url) parsed = urlparse(url)
return (parsed.scheme in {'http', 'https'} return (parsed.scheme in {'http', 'https'}
and '.' in parsed.netloc and '.' in parsed.netloc
and self.filter_chain.apply(url)) and await self.filter_chain.apply(url))
except Exception: except Exception:
return False return False

View File

@@ -8,224 +8,16 @@ from functools import lru_cache
import fnmatch import fnmatch
from dataclasses import dataclass from dataclasses import dataclass
import weakref import weakref
import mimetypes
import math import math
from collections import defaultdict from collections import defaultdict
from typing import Dict from typing import Dict
from ..utils import HeadPeekr from ..utils import HeadPeekr
import asyncio
import inspect
@dataclass @dataclass
class FilterStats: class FilterStats:
# PERF: Using dataclass creates overhead with __init__ and property access
# PERF: Could use __slots__ to reduce memory footprint
# PERF: Consider using array.array('I') for atomic increments
total_urls: int = 0
rejected_urls: int = 0
passed_urls: int = 0
class URLFilter(ABC):
    """Abstract base class for URL filters.

    Every instance carries a display name, a ``FilterStats`` counter object
    and a logger named ``urlfilter.<name>``. Concrete filters implement
    ``apply`` and report each decision through ``_update_stats``.
    """

    # PERF: the logger and the stats object are both created eagerly for
    # every filter instance; lazy initialization would avoid that cost.

    def __init__(self, name: str = None):
        """Initialise the filter's name, statistics and logger.

        Args:
            name: Label for this filter. When falsy, the class name is used.
        """
        label = name or self.__class__.__name__
        self.name = label
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{label}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Decide whether ``url`` passes this filter (subclasses implement)."""

    def _update_stats(self, passed: bool):
        """Record one filter decision in ``self.stats``.

        Args:
            passed: Outcome of the decision being recorded.
        """
        counters = self.stats
        counters.total_urls += 1
        # The outcome is added directly to the counters: ``passed`` bumps the
        # passed counter and its negation bumps the rejected counter.
        counters.passed_urls += passed
        counters.rejected_urls += not passed
class FilterChain:
    """Ordered collection of URL filters applied one after another.

    A URL passes the chain only if every filter accepts it; evaluation stops
    at the first filter that rejects.
    """

    # PERF: every URL walks the filter list; a single-filter fast path or
    # ordering filters by rejection rate could reduce work.

    def __init__(self, filters: List[URLFilter] = None):
        """Store the filters and create chain-level statistics and logger.

        Args:
            filters: Filters to apply, in order. A falsy value yields an
                empty chain.
        """
        self.filters = filters if filters else []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def apply(self, url: str) -> bool:
        """Run ``url`` through the chain.

        Args:
            url: URL to test.

        Returns:
            ``True`` when no filter rejects the URL, ``False`` otherwise.
        """
        chain_stats = self.stats
        chain_stats.total_urls += 1

        # The generator is lazy, so filters after the first rejection are
        # never invoked.
        rejecting = next(
            (candidate for candidate in self.filters if not candidate.apply(url)),
            None,
        )
        if rejecting is None:
            chain_stats.passed_urls += 1
            return True

        chain_stats.rejected_urls += 1
        self.logger.debug(f"URL {url} rejected by {rejecting.name}")
        return False
class URLPatternFilter(URLFilter):
    """Accept URLs that match at least one glob or regular-expression pattern."""

    # PERF: each pattern is compiled and evaluated separately; they could be
    # merged into a single alternation.

    def __init__(
        self,
        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
        use_glob: bool = True,
    ):
        """Compile the supplied patterns.

        Args:
            patterns: A single pattern or a list of patterns. Each may be a
                string or an already-compiled pattern object.
            use_glob: When true, string patterns are translated from glob
                syntax; otherwise strings are compiled as regular
                expressions. Pre-compiled patterns are used unchanged.
        """
        super().__init__()
        if isinstance(patterns, (str, Pattern)):
            self.patterns = [patterns]
        else:
            self.patterns = patterns
        self.use_glob = use_glob
        self._compiled_patterns = []

        for candidate in self.patterns:
            if not isinstance(candidate, str):
                compiled = candidate
            elif use_glob:
                compiled = self._glob_to_regex(candidate)
            else:
                compiled = re.compile(candidate)
            self._compiled_patterns.append(compiled)

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Translate a glob string with ``fnmatch.translate`` and compile it."""
        translated = fnmatch.translate(pattern)
        return re.compile(translated)

    def apply(self, url: str) -> bool:
        """Return whether any compiled pattern is found in ``url``.

        The outcome is recorded via ``_update_stats`` before returning.
        """
        matched = False
        for regex in self._compiled_patterns:
            if regex.search(url):
                matched = True
                break
        self._update_stats(matched)
        return matched
class ContentTypeFilter(URLFilter):
    """Filter URLs by the MIME type guessed from their file extension."""

    # PERF: mimetypes guessing and URL parsing run on every check, and
    # results for repeated extensions are not cached.

    def __init__(
        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        """Store the allowed MIME types in lower case.

        Args:
            allowed_types: One MIME type string or a list of them. A URL is
                accepted when any of these is a substring of the guessed type.
            check_extension: When false, ``apply`` accepts every URL.
        """
        super().__init__()
        self.allowed_types = (
            [allowed_types] if isinstance(allowed_types, str) else allowed_types
        )
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Normalize content type strings"""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Return whether the URL's extension maps to an allowed MIME type.

        URLs whose final path segment has no extension are accepted. The
        extension is taken from the final path segment only, so a dot inside
        a directory name (for example ``/v1.2/page``) is not mistaken for a
        file extension.
        """
        # Parse once and work on the path so that the query string and
        # fragment never influence the result.
        path = urlparse(url).path
        filename = path.rsplit("/", 1)[-1]
        _, dot, ext = filename.rpartition(".")
        if not dot or not ext:
            return True

        guessed_type = (mimetypes.guess_type(path)[0] or "").lower()
        return any(allowed in guessed_type for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
        result = True
        if self.check_extension:
            result = self._check_extension(url)
        self._update_stats(result)
        return result
class DomainFilter(URLFilter):
    """Accept or reject URLs based on the ``netloc`` component of the URL.

    Entries are compared against the lower-cased ``netloc``. An entry
    containing ``*`` or ``?`` is treated as an ``fnmatch``-style wildcard
    pattern (for example ``"ads.*"``); every other entry must match exactly.
    """

    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        """Normalise and store the allow and block lists.

        Args:
            allowed_domains: Domain or list of domains to accept. When falsy,
                no allow-list is enforced.
            blocked_domains: Domain or list of domains to reject. When falsy,
                nothing is blocked.
        """
        super().__init__()
        self.allowed_domains = (
            set(self._normalize_domains(allowed_domains)) if allowed_domains else None
        )
        self.blocked_domains = (
            set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
        )

    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Return the given domain(s) as a list of lower-cased, stripped strings."""
        if isinstance(domains, str):
            domains = [domains]
        return [d.lower().strip() for d in domains]

    def _extract_domain(self, url: str) -> str:
        """Return the lower-cased ``netloc`` of ``url``."""
        return urlparse(url).netloc.lower()

    @staticmethod
    def _matches(domain: str, entries) -> bool:
        """Return whether ``domain`` equals an entry or matches a wildcard entry."""
        # Exact membership first: this is the common case and a set lookup.
        if domain in entries:
            return True
        # Only entries that actually contain a wildcard are treated as
        # patterns, so plain entries keep their exact-match semantics.
        return any(
            fnmatch.fnmatchcase(domain, entry)
            for entry in entries
            if "*" in entry or "?" in entry
        )

    def apply(self, url: str) -> bool:
        """Return whether ``url`` is permitted by the block and allow lists.

        The block list is checked first; the allow list is enforced only when
        one was supplied. The outcome is recorded via ``_update_stats``.
        """
        domain = self._extract_domain(url)

        if self._matches(domain, self.blocked_domains):
            self._update_stats(False)
            return False

        if self.allowed_domains is not None and not self._matches(
            domain, self.allowed_domains
        ):
            self._update_stats(False)
            return False

        self._update_stats(True)
        return True
# Example usage:
def create_common_filter_chain() -> FilterChain:
    """Build a sample chain: URL patterns, HTML content types, blocked domains."""
    # HTML files and common content paths.
    content_patterns = URLPatternFilter(
        [
            "*.html",
            "*.htm",
            "*/article/*",
            "*/blog/*",
        ]
    )
    html_only = ContentTypeFilter(["text/html", "application/xhtml+xml"])
    # NOTE(review): these entries contain glob wildcards; they only take
    # effect if DomainFilter treats its entries as patterns rather than exact
    # hostnames - verify against DomainFilter.apply.
    no_ad_hosts = DomainFilter(blocked_domains=["ads.*", "analytics.*"])

    return FilterChain([content_patterns, html_only, no_ad_hosts])
####################################################################################
# Unclecode: Optimized Version
####################################################################################
# Use __slots__ and array for maximum memory/speed efficiency
@dataclass
class FastFilterStats:
__slots__ = ("_counters",) __slots__ = ("_counters",)
def __init__(self): def __init__(self):
@@ -245,14 +37,14 @@ class FastFilterStats:
return self._counters[2] return self._counters[2]
class FastURLFilter(ABC): class URLFilter(ABC):
"""Optimized base filter class""" """Optimized base filter class"""
__slots__ = ("name", "stats", "_logger_ref") __slots__ = ("name", "stats", "_logger_ref")
def __init__(self, name: str = None): def __init__(self, name: str = None):
self.name = name or self.__class__.__name__ self.name = name or self.__class__.__name__
self.stats = FastFilterStats() self.stats = FilterStats()
# Lazy logger initialization using weakref # Lazy logger initialization using weakref
self._logger_ref = None self._logger_ref = None
@@ -274,14 +66,14 @@ class FastURLFilter(ABC):
self.stats._counters[2] += not passed # rejected self.stats._counters[2] += not passed # rejected
class FastFilterChain: class FilterChain:
"""Optimized filter chain""" """Optimized filter chain"""
__slots__ = ("filters", "stats", "_logger_ref") __slots__ = ("filters", "stats", "_logger_ref")
def __init__(self, filters: List[FastURLFilter] = None): def __init__(self, filters: List[URLFilter] = None):
self.filters = tuple(filters or []) # Immutable tuple for speed self.filters = tuple(filters or []) # Immutable tuple for speed
self.stats = FastFilterStats() self.stats = FilterStats()
self._logger_ref = None self._logger_ref = None
@property @property
@@ -291,37 +83,62 @@ class FastFilterChain:
self._logger_ref = weakref.ref(logger) self._logger_ref = weakref.ref(logger)
return self._logger_ref() return self._logger_ref()
def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain": def add_filter(self, filter_: URLFilter) -> "FilterChain":
"""Add a filter to the chain""" """Add a filter to the chain"""
self.filters.append(filter_) self.filters.append(filter_)
return self # Enable method chaining return self # Enable method chaining
def apply(self, url: str) -> bool: async def apply(self, url: str) -> bool:
"""Optimized apply with minimal operations""" """Apply all filters concurrently when possible"""
self.stats._counters[0] += 1 # total self.stats._counters[0] += 1 # Total processed URLs
# Direct tuple iteration is faster than list tasks = []
for f in self.filters: for f in self.filters:
if not f.apply(url): result = f.apply(url)
self.stats._counters[2] += 1 # rejected
if inspect.isawaitable(result):
tasks.append(result) # Collect async tasks
elif not result: # Sync rejection
self.stats._counters[2] += 1 # Sync rejected
return False return False
self.stats._counters[1] += 1 # passed if tasks:
results = await asyncio.gather(*tasks)
# Count how many filters rejected
rejections = results.count(False)
self.stats._counters[2] += rejections
if not all(results):
return False # Stop early if any filter rejected
self.stats._counters[1] += 1 # Passed
return True return True
class FastURLPatternFilter(FastURLFilter):
class URLPatternFilter(URLFilter):
"""Pattern filter balancing speed and completeness""" """Pattern filter balancing speed and completeness"""
__slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')
__slots__ = (
"_simple_suffixes",
"_simple_prefixes",
"_domain_patterns",
"_path_patterns",
)
PATTERN_TYPES = { PATTERN_TYPES = {
'SUFFIX': 1, # *.html "SUFFIX": 1, # *.html
'PREFIX': 2, # /foo/* "PREFIX": 2, # /foo/*
'DOMAIN': 3, # *.example.com "DOMAIN": 3, # *.example.com
'PATH': 4 , # Everything else "PATH": 4, # Everything else
'REGEX': 5 "REGEX": 5,
} }
def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True): def __init__(
self,
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
use_glob: bool = True,
):
super().__init__() super().__init__()
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
@@ -337,48 +154,50 @@ class FastURLPatternFilter(FastURLFilter):
def _categorize_pattern(self, pattern: str) -> int: def _categorize_pattern(self, pattern: str) -> int:
"""Categorize pattern for specialized handling""" """Categorize pattern for specialized handling"""
if not isinstance(pattern, str): if not isinstance(pattern, str):
return self.PATTERN_TYPES['PATH'] return self.PATTERN_TYPES["PATH"]
# Check if it's a regex pattern # Check if it's a regex pattern
if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern: if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern:
return self.PATTERN_TYPES['REGEX'] return self.PATTERN_TYPES["REGEX"]
if pattern.count('*') == 1: if pattern.count("*") == 1:
if pattern.startswith('*.'): if pattern.startswith("*."):
return self.PATTERN_TYPES['SUFFIX'] return self.PATTERN_TYPES["SUFFIX"]
if pattern.endswith('/*'): if pattern.endswith("/*"):
return self.PATTERN_TYPES['PREFIX'] return self.PATTERN_TYPES["PREFIX"]
if '://' in pattern and pattern.startswith('*.'): if "://" in pattern and pattern.startswith("*."):
return self.PATTERN_TYPES['DOMAIN'] return self.PATTERN_TYPES["DOMAIN"]
return self.PATTERN_TYPES['PATH'] return self.PATTERN_TYPES["PATH"]
def _add_pattern(self, pattern: str, pattern_type: int): def _add_pattern(self, pattern: str, pattern_type: int):
"""Add pattern to appropriate matcher""" """Add pattern to appropriate matcher"""
if pattern_type == self.PATTERN_TYPES['REGEX']: if pattern_type == self.PATTERN_TYPES["REGEX"]:
# For regex patterns, compile directly without glob translation # For regex patterns, compile directly without glob translation
if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern): if isinstance(pattern, str) and (
pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern
):
self._path_patterns.append(re.compile(pattern)) self._path_patterns.append(re.compile(pattern))
return return
elif pattern_type == self.PATTERN_TYPES['SUFFIX']: elif pattern_type == self.PATTERN_TYPES["SUFFIX"]:
self._simple_suffixes.add(pattern[2:]) self._simple_suffixes.add(pattern[2:])
elif pattern_type == self.PATTERN_TYPES['PREFIX']: elif pattern_type == self.PATTERN_TYPES["PREFIX"]:
self._simple_prefixes.add(pattern[:-2]) self._simple_prefixes.add(pattern[:-2])
elif pattern_type == self.PATTERN_TYPES['DOMAIN']: elif pattern_type == self.PATTERN_TYPES["DOMAIN"]:
self._domain_patterns.append( self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\.")))
re.compile(pattern.replace('*.', r'[^/]+\.'))
)
else: else:
if isinstance(pattern, str): if isinstance(pattern, str):
# Handle complex glob patterns # Handle complex glob patterns
if '**' in pattern: if "**" in pattern:
pattern = pattern.replace('**', '.*') pattern = pattern.replace("**", ".*")
if '{' in pattern: if "{" in pattern:
# Convert {a,b} to (a|b) # Convert {a,b} to (a|b)
pattern = re.sub(r'\{([^}]+)\}', pattern = re.sub(
lambda m: f'({"|".join(m.group(1).split(","))})', r"\{([^}]+)\}",
pattern) lambda m: f'({"|".join(m.group(1).split(","))})',
pattern,
)
pattern = fnmatch.translate(pattern) pattern = fnmatch.translate(pattern)
self._path_patterns.append( self._path_patterns.append(
pattern if isinstance(pattern, Pattern) else re.compile(pattern) pattern if isinstance(pattern, Pattern) else re.compile(pattern)
@@ -389,8 +208,8 @@ class FastURLPatternFilter(FastURLFilter):
"""Hierarchical pattern matching""" """Hierarchical pattern matching"""
# Quick suffix check (*.html) # Quick suffix check (*.html)
if self._simple_suffixes: if self._simple_suffixes:
path = url.split('?')[0] path = url.split("?")[0]
if path.split('/')[-1].split('.')[-1] in self._simple_suffixes: if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
self._update_stats(True) self._update_stats(True)
return True return True
@@ -403,7 +222,7 @@ class FastURLPatternFilter(FastURLFilter):
# Prefix check (/foo/*) # Prefix check (/foo/*)
if self._simple_prefixes: if self._simple_prefixes:
path = url.split('?')[0] path = url.split("?")[0]
if any(path.startswith(p) for p in self._simple_prefixes): if any(path.startswith(p) for p in self._simple_prefixes):
self._update_stats(True) self._update_stats(True)
return True return True
@@ -418,7 +237,7 @@ class FastURLPatternFilter(FastURLFilter):
return False return False
class FastContentTypeFilter(FastURLFilter): class ContentTypeFilter(URLFilter):
"""Optimized content type filter using fast lookups""" """Optimized content type filter using fast lookups"""
__slots__ = ("allowed_types", "_ext_map", "_check_extension") __slots__ = ("allowed_types", "_ext_map", "_check_extension")
@@ -515,14 +334,30 @@ class FastContentTypeFilter(FastURLFilter):
@staticmethod @staticmethod
@lru_cache(maxsize=1000) @lru_cache(maxsize=1000)
def _extract_extension(path: str) -> str: def _extract_extension(url: str) -> str:
"""Fast extension extraction with caching""" """Extracts file extension from a URL."""
if "." not in path: # Remove scheme (http://, https://) if present
if "://" in url:
url = url.split("://", 1)[-1] # Get everything after '://'
# Remove domain (everything up to the first '/')
path_start = url.find("/")
path = url[path_start:] if path_start != -1 else ""
# Extract last filename in path
filename = path.rsplit("/", 1)[-1] if "/" in path else ""
# Extract and validate extension
if "." not in filename:
return "" return ""
return path.rpartition(".")[-1].lower()
return filename.rpartition(".")[-1].lower()
def __init__( def __init__(
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP self,
allowed_types: Union[str, List[str]],
check_extension: bool = True,
ext_map: Dict[str, str] = _MIME_MAP,
): ):
super().__init__() super().__init__()
# Normalize and store as frozenset for fast lookup # Normalize and store as frozenset for fast lookup
@@ -546,9 +381,7 @@ class FastContentTypeFilter(FastURLFilter):
"""Cached URL checking""" """Cached URL checking"""
if not self._check_extension: if not self._check_extension:
return True return True
ext = self._extract_extension(url)
path = url.split("?")[0] # Fast path split
ext = self._extract_extension(path)
if not ext: if not ext:
return True return True
@@ -561,7 +394,7 @@ class FastContentTypeFilter(FastURLFilter):
return result return result
class FastDomainFilter(FastURLFilter): class DomainFilter(URLFilter):
"""Optimized domain filter with fast lookups and caching""" """Optimized domain filter with fast lookups and caching"""
__slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache") __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
@@ -599,7 +432,7 @@ class FastDomainFilter(FastURLFilter):
@lru_cache(maxsize=10000) @lru_cache(maxsize=10000)
def _extract_domain(url: str) -> str: def _extract_domain(url: str) -> str:
"""Ultra-fast domain extraction with regex and caching""" """Ultra-fast domain extraction with regex and caching"""
match = FastDomainFilter._DOMAIN_REGEX.search(url) match = DomainFilter._DOMAIN_REGEX.search(url)
return match.group(1).lower() if match else "" return match.group(1).lower() if match else ""
def apply(self, url: str) -> bool: def apply(self, url: str) -> bool:
@@ -627,19 +460,24 @@ class FastDomainFilter(FastURLFilter):
return result return result
class ContentRelevanceFilter(URLFilter): class ContentRelevanceFilter(URLFilter):
"""BM25-based relevance filter using head section content""" """BM25-based relevance filter using head section content"""
__slots__ = ('query_terms', 'threshold', 'k1', 'b', 'avgdl') __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
def __init__(self, query: str, threshold: float, def __init__(
k1: float = 1.2, b: float = 0.75, avgdl: int = 1000): self,
query: str,
threshold: float,
k1: float = 1.2,
b: float = 0.75,
avgdl: int = 1000,
):
super().__init__(name="BM25RelevanceFilter") super().__init__(name="BM25RelevanceFilter")
self.query_terms = self._tokenize(query) self.query_terms = self._tokenize(query)
self.threshold = threshold self.threshold = threshold
self.k1 = k1 # TF saturation parameter self.k1 = k1 # TF saturation parameter
self.b = b # Length normalization parameter self.b = b # Length normalization parameter
self.avgdl = avgdl # Average document length (empirical value) self.avgdl = avgdl # Average document length (empirical value)
async def apply(self, url: str) -> bool: async def apply(self, url: str) -> bool:
@@ -650,8 +488,8 @@ class ContentRelevanceFilter(URLFilter):
# Field extraction with weighting # Field extraction with weighting
fields = { fields = {
'title': HeadPeekr.get_title(head_content) or "", "title": HeadPeekr.get_title(head_content) or "",
'meta': HeadPeekr.extract_meta_tags(head_content) "meta": HeadPeekr.extract_meta_tags(head_content),
} }
doc_text = self._build_document(fields) doc_text = self._build_document(fields)
@@ -662,12 +500,14 @@ class ContentRelevanceFilter(URLFilter):
def _build_document(self, fields: Dict) -> str: def _build_document(self, fields: Dict) -> str:
"""Weighted document construction""" """Weighted document construction"""
return ' '.join([ return " ".join(
fields['title'] * 3, # Title weight [
fields['meta'].get('description', '') * 2, fields["title"] * 3, # Title weight
fields['meta'].get('keywords', ''), fields["meta"].get("description", "") * 2,
' '.join(fields['meta'].values()) fields["meta"].get("keywords", ""),
]) " ".join(fields["meta"].values()),
]
)
def _tokenize(self, text: str) -> List[str]: def _tokenize(self, text: str) -> List[str]:
"""Fast case-insensitive tokenization""" """Fast case-insensitive tokenization"""
@@ -687,8 +527,9 @@ class ContentRelevanceFilter(URLFilter):
term_freq = tf[term] term_freq = tf[term]
idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF
numerator = term_freq * (self.k1 + 1) numerator = term_freq * (self.k1 + 1)
denominator = term_freq + self.k1 * (1 - self.b + denominator = term_freq + self.k1 * (
self.b * (doc_len / self.avgdl)) 1 - self.b + self.b * (doc_len / self.avgdl)
)
score += idf * (numerator / denominator) score += idf * (numerator / denominator)
return score return score
@@ -697,29 +538,35 @@ class ContentRelevanceFilter(URLFilter):
class SEOFilter(URLFilter): class SEOFilter(URLFilter):
"""Quantitative SEO quality assessment filter using head section analysis""" """Quantitative SEO quality assessment filter using head section analysis"""
__slots__ = ('threshold', '_weights', '_kw_patterns') __slots__ = ("threshold", "_weights", "_kw_patterns")
# Based on SEMrush/Google ranking factors research # Based on SEMrush/Google ranking factors research
DEFAULT_WEIGHTS = { DEFAULT_WEIGHTS = {
'title_length': 0.15, "title_length": 0.15,
'title_kw': 0.18, "title_kw": 0.18,
'meta_description': 0.12, "meta_description": 0.12,
'canonical': 0.10, "canonical": 0.10,
'robot_ok': 0.20, # Most critical factor "robot_ok": 0.20, # Most critical factor
'schema_org': 0.10, "schema_org": 0.10,
'url_quality': 0.15 "url_quality": 0.15,
} }
def __init__(
    self,
    threshold: float = 0.65,
    keywords: List[str] = None,
    weights: Dict[str, float] = None,
):
    """Configure the SEO filter.

    Args:
        threshold: Minimum weighted total score a URL needs to pass.
        keywords: Terms matched case-insensitively as whole words when
            scoring keyword presence. No pattern is built when this is
            empty or ``None``.
        weights: Per-factor weights. ``DEFAULT_WEIGHTS`` is used when this
            is empty or ``None``.
    """
    super().__init__(name="SEOFilter")
    self.threshold = threshold
    self._weights = weights if weights else self.DEFAULT_WEIGHTS

    if keywords:
        # One alternation of escaped keywords, bounded by word boundaries.
        alternation = "|".join(re.escape(word) for word in keywords)
        self._kw_patterns = re.compile(r"\b({})\b".format(alternation), re.I)
    else:
        self._kw_patterns = None
async def apply(self, url: str) -> bool: async def apply(self, url: str) -> bool:
head_content = await HeadPeekr.peek_html(url) head_content = await HeadPeekr.peek_html(url)
@@ -728,21 +575,24 @@ class SEOFilter(URLFilter):
return False return False
meta = HeadPeekr.extract_meta_tags(head_content) meta = HeadPeekr.extract_meta_tags(head_content)
title = HeadPeekr.get_title(head_content) or '' title = HeadPeekr.get_title(head_content) or ""
parsed_url = urlparse(url) parsed_url = urlparse(url)
scores = { scores = {
'title_length': self._score_title_length(title), "title_length": self._score_title_length(title),
'title_kw': self._score_keyword_presence(title), "title_kw": self._score_keyword_presence(title),
'meta_description': self._score_meta_description(meta.get('description', '')), "meta_description": self._score_meta_description(
'canonical': self._score_canonical(meta.get('canonical'), url), meta.get("description", "")
'robot_ok': 1.0 if 'noindex' not in meta.get('robots', '') else 0.0, ),
'schema_org': self._score_schema_org(head_content), "canonical": self._score_canonical(meta.get("canonical"), url),
'url_quality': self._score_url_quality(parsed_url) "robot_ok": 1.0 if "noindex" not in meta.get("robots", "") else 0.0,
"schema_org": self._score_schema_org(head_content),
"url_quality": self._score_url_quality(parsed_url),
} }
total_score = sum(weight * scores[factor] total_score = sum(
for factor, weight in self._weights.items()) weight * scores[factor] for factor, weight in self._weights.items()
)
decision = total_score >= self.threshold decision = total_score >= self.threshold
self._update_stats(decision) self._update_stats(decision)
@@ -750,291 +600,49 @@ class SEOFilter(URLFilter):
def _score_title_length(self, title: str) -> float: def _score_title_length(self, title: str) -> float:
length = len(title) length = len(title)
if 50 <= length <= 60: return 1.0 if 50 <= length <= 60:
if 40 <= length < 50 or 60 < length <= 70: return 0.7 return 1.0
if 40 <= length < 50 or 60 < length <= 70:
return 0.7
return 0.3 # Poor length return 0.3 # Poor length
def _score_keyword_presence(self, text: str) -> float: def _score_keyword_presence(self, text: str) -> float:
if not self._kw_patterns: return 0.0 if not self._kw_patterns:
return 0.0
matches = len(self._kw_patterns.findall(text)) matches = len(self._kw_patterns.findall(text))
return min(matches * 0.3, 1.0) # Max 3 matches return min(matches * 0.3, 1.0) # Max 3 matches
def _score_meta_description(self, desc: str) -> float: def _score_meta_description(self, desc: str) -> float:
length = len(desc) length = len(desc)
if 140 <= length <= 160: return 1.0 if 140 <= length <= 160:
return 1.0
return 0.5 if 120 <= length <= 200 else 0.2 return 0.5 if 120 <= length <= 200 else 0.2
def _score_canonical(self, canonical: str, original: str) -> float: def _score_canonical(self, canonical: str, original: str) -> float:
if not canonical: return 0.5 # Neutral score if not canonical:
return 0.5 # Neutral score
return 1.0 if canonical == original else 0.2 return 1.0 if canonical == original else 0.2
def _score_schema_org(self, html: str) -> float: def _score_schema_org(self, html: str) -> float:
# Detect any schema.org markup in head # Detect any schema.org markup in head
return 1.0 if re.search(r'<script[^>]+type=["\']application/ld\+json', html) else 0.0 return (
1.0
if re.search(r'<script[^>]+type=["\']application/ld\+json', html)
else 0.0
)
def _score_url_quality(self, parsed_url) -> float: def _score_url_quality(self, parsed_url) -> float:
score = 1.0 score = 1.0
path = parsed_url.path.lower() path = parsed_url.path.lower()
# Penalty factors # Penalty factors
if len(path) > 80: score *= 0.7 if len(path) > 80:
if re.search(r'\d{4}', path): score *= 0.8 # Numbers in path score *= 0.7
if parsed_url.query: score *= 0.6 # URL parameters if re.search(r"\d{4}", path):
if '_' in path: score *= 0.9 # Underscores vs hyphens score *= 0.8 # Numbers in path
if parsed_url.query:
score *= 0.6 # URL parameters
if "_" in path:
score *= 0.9 # Underscores vs hyphens
return score return score
def create_fast_filter_chain() -> FastFilterChain:
    """Build a FastFilterChain of domain, content-type and URL-pattern filters.

    The filters are placed in the author's stated cost order: domain check
    first (fastest rejection), content type second, URL patterns last
    (most expensive).
    """
    domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    content_filter = FastContentTypeFilter(["text/html", "application/xhtml+xml"])
    pattern_filter = FastURLPatternFilter(
        ["*.html", "*.htm", "*/article/*", "*/blog/*"]
    )
    return FastFilterChain([domain_filter, content_filter, pattern_filter])
def run_performance_test():
    """Benchmark the original filters against their Fast* counterparts.

    Prints timings for each individual filter over the first 1000 generated
    URLs, timings for both full chains over every generated URL, and the
    ``sys.getsizeof`` size of each filter object.
    """
    import sys
    import time

    # Generate test URLs
    base_urls = [
        "https://example.com/article/123",
        "https://blog.example.com/post/456",
        "https://ads.example.com/tracking",
        "https://example.com/about.html",
        "https://analytics.example.com/script.js",
        "https://example.com/products.php",
        "https://subdomain.example.com/blog/post-123",
        "https://example.com/path/file.pdf",
    ]

    # Create more varied test data: each base URL plus ten page_N.html variants
    test_urls = []
    for base in base_urls:
        test_urls.append(base)
        parts = base.split("/")
        for i in range(10):
            parts[-1] = f"page_{i}.html"
            test_urls.append("/".join(parts))

    # Multiply to get enough test data
    test_urls = test_urls * 10000  # 8 bases x 11 URLs x 10000 = 880k URLs

    def benchmark(name: str, func, *args, warmup=True):
        """Time one call of ``func`` and print elapsed ms and URLs/sec."""
        if warmup:
            # Warmup run
            func(*args)

        # Actual timing
        start = time.perf_counter_ns()
        result = func(*args)
        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms

        # Every callable passed in returns one entry per URL it processed, so
        # the throughput is derived from the result length rather than from
        # the full data set (the subset benchmarks only touch 1000 URLs).
        processed = len(result)
        rate = processed / elapsed * 1000 if elapsed else float("inf")
        print(f"{name:<30} {elapsed:>8.3f} ms ({rate:,.0f} URLs/sec)")
        return result

    print("\nBenchmarking original vs optimized implementations...")
    print("-" * 70)

    # Original implementation
    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
    content_filter = ContentTypeFilter(["text/html"])
    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
    chain = FilterChain([pattern_filter, content_filter, domain_filter])

    # Optimized implementation
    fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
    fast_content_filter = FastContentTypeFilter(["text/html"])
    fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    fast_chain = FastFilterChain(
        [fast_domain_filter, fast_content_filter, fast_pattern_filter]
    )

    # Test individual filters
    print("\nSingle filter performance (first 1000 URLs):")
    test_subset = test_urls[:1000]

    print("\nPattern Filters:")
    benchmark(
        "Original Pattern Filter",
        lambda: [pattern_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Pattern Filter",
        lambda: [fast_pattern_filter.apply(url) for url in test_subset],
    )

    print("\nContent Filters:")
    benchmark(
        "Original Content Filter",
        lambda: [content_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Content Filter",
        lambda: [fast_content_filter.apply(url) for url in test_subset],
    )

    print("\nDomain Filters:")
    benchmark(
        "Original Domain Filter",
        lambda: [domain_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Domain Filter",
        lambda: [fast_domain_filter.apply(url) for url in test_subset],
    )

    print("\nFull Chain Performance (all URLs):")
    # Test chain
    benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
    benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])

    # Memory usage. NOTE: sys.getsizeof is shallow; it does not follow
    # references into the objects a filter holds.
    print("\nMemory Usage per Filter:")
    print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
    print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
    print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
    print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
    print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
    print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
def test_pattern_filter():
    """Check FastURLPatternFilter against expected verdicts, then time it.

    The speed comparison against URLPatternFilter only runs when every
    accuracy case passes.
    """
    import time
    from itertools import chain

    # (pattern or list of patterns, {url: expected verdict}) pairs; a list is
    # used rather than a dict so one entry can carry several patterns.
    test_cases = [
        # Simple suffix patterns (*.html)
        (
            "*.html",
            {
                "https://example.com/page.html": True,
                "https://example.com/path/doc.html": True,
                "https://example.com/page.htm": False,
                "https://example.com/page.html?param=1": True,
            },
        ),
        # Path prefix patterns (/foo/*)
        (
            "*/article/*",
            {
                "https://example.com/article/123": True,
                "https://example.com/blog/article/456": True,
                "https://example.com/articles/789": False,
                "https://example.com/article": False,
            },
        ),
        # Complex patterns
        (
            "blog-*-[0-9]",
            {
                "https://example.com/blog-post-1": True,
                "https://example.com/blog-test-9": True,
                "https://example.com/blog-post": False,
                "https://example.com/blog-post-x": False,
            },
        ),
        # Multiple patterns case
        (
            ["*.pdf", "*/download/*"],
            {
                "https://example.com/doc.pdf": True,
                "https://example.com/download/file.txt": True,
                "https://example.com/path/download/doc": True,
                "https://example.com/uploads/file.txt": False,
            },
        ),
        # Edge cases
        (
            "*",
            {
                "https://example.com": True,
                "": True,
                "http://test.com/path": True,
            },
        ),
        # Complex regex
        (
            r"^https?://.*\.example\.com/\d+",
            {
                "https://sub.example.com/123": True,
                "http://test.example.com/456": True,
                "https://example.com/789": False,
                "https://sub.example.com/abc": False,
            },
        ),
    ]

    def run_accuracy_test():
        """Print a pass/fail line per case; return True when all pass."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for patterns, expectations in test_cases:
            filter_obj = FastURLPatternFilter(patterns)
            for url, expected in expectations.items():
                result = filter_obj.apply(url)
                if result == expected:
                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
                    continue
                print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
                print(f" Expected: {expected}, Got: {result}")
                all_passed = False
        return all_passed

    def run_speed_test():
        """Time both filter implementations over the repeated case URLs."""
        print("\nSpeed Tests:")
        print("-" * 50)

        # Create a large set of test URLs (iterating a dict yields its keys)
        all_urls = list(chain.from_iterable(urls for _, urls in test_cases))
        test_urls = all_urls * 10000  # 100K+ URLs

        # Test both implementations
        shared_patterns = ["*.html", "*/article/*", "blog-*"]
        original = URLPatternFilter(list(shared_patterns))
        optimized = FastURLPatternFilter(list(shared_patterns))

        def benchmark(name, filter_obj):
            started = time.perf_counter()
            for url in test_urls:
                filter_obj.apply(url)
            elapsed = time.perf_counter() - started
            urls_per_sec = len(test_urls) / elapsed
            print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")

        benchmark("Original Filter:", original)
        benchmark("Optimized Filter:", optimized)

    # Run tests
    print("Running Pattern Filter Tests...")
    if not run_accuracy_test():
        print("\n❌ Some accuracy tests failed!")
        return
    print("\n✨ All accuracy tests passed!")
    run_speed_test()
async def test_content_relevancy_filter():
    """Print a relevance verdict for two sample URLs via ContentRelevanceFilter."""
    # Initialize with query and threshold (tune based on your corpus)
    relevance_filter = ContentRelevanceFilter(query="machine learning", threshold=2.5)

    # In your crawler loop
    sample_urls = ("https://example.com", "https://example.com/blog/post-123")
    for url in sample_urls:
        is_relevant = await relevance_filter.apply(url)
        print(f"✅ Relevant: {url}" if is_relevant else f"❌ Not Relevant: {url}")
if __name__ == "__main__":
    # Script entry point: runs the filter benchmark by default.
    run_performance_test()
    # test_pattern_filter()

View File

@@ -23,35 +23,7 @@ _FRESHNESS_SCORES = [
0.5, # 5 years ago 0.5, # 5 years ago
] ]
# Pre-computed normalization factors: _POW2_NORM[i] == 2 ** -i for i in 0..6
_POW2_NORM = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625]
@dataclass
class ScoringStats:
    """Running count, sum, minimum and maximum of the scores given to ``update``."""

    urls_scored: int = 0
    total_score: float = 0.0
    # Sentinels: the first finite score recorded replaces both of them.
    min_score: float = float("inf")
    max_score: float = float("-inf")

    def update(self, score: float):
        """Fold one score into the running statistics."""
        self.urls_scored += 1
        self.total_score += score
        if score < self.min_score:
            self.min_score = score
        if score > self.max_score:
            self.max_score = score

    @property
    def average_score(self) -> float:
        """Mean of all recorded scores, or 0.0 before the first update."""
        if self.urls_scored > 0:
            return self.total_score / self.urls_scored
        return 0.0
class FastScoringStats:
__slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score') __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')
def __init__(self): def __init__(self):
@@ -88,32 +60,7 @@ class FastScoringStats:
if self._max_score is None: if self._max_score is None:
self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0 self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0
return self._max_score return self._max_score
class URLScorer(ABC):
    """Base class for URL scorers.

    Subclasses implement ``_calculate_score``; ``score`` multiplies that raw
    value by ``weight`` and records the result in ``stats``.
    """

    def __init__(self, weight: float = 1.0, name: str = None):
        self.weight = weight
        # Fall back to the concrete class name when no name is given.
        self.name = name if name else type(self).__name__
        self.stats = ScoringStats()
        self.logger = logging.getLogger(f"urlscorer.{self.name}")

    @abstractmethod
    def _calculate_score(self, url: str) -> float:
        """Return the unweighted score for ``url``."""

    def score(self, url: str) -> float:
        """Return the weighted score for ``url`` and record it in ``stats``."""
        weighted = self._calculate_score(url) * self.weight
        self.stats.update(weighted)
        return weighted
# Optimized base class
class FastURLScorer(ABC):
__slots__ = ('_weight', '_stats') __slots__ = ('_weight', '_stats')
def __init__(self, weight: float = 1.0): def __init__(self, weight: float = 1.0):
@@ -142,31 +89,6 @@ class FastURLScorer(ABC):
return self._weight return self._weight
class CompositeScorer(URLScorer):
    """Scorer that sums, or when normalizing averages, its child scorers' scores."""

    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
        super().__init__(name="CompositeScorer")
        self.scorers = scorers
        self.normalize = normalize

    def _calculate_score(self, url: str) -> float:
        """Add up each child's weighted score; divide by the child count if normalizing."""
        total = 0
        count = 0
        for scorer in self.scorers:
            total += scorer.score(url)
            count += 1
        if self.normalize and count:
            total /= count
        return total
class FastCompositeScorer(FastURLScorer):
__slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array') __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')
def __init__(self, scorers: List[URLScorer], normalize: bool = True): def __init__(self, scorers: List[URLScorer], normalize: bool = True):
@@ -236,50 +158,6 @@ class FastCompositeScorer(FastURLScorer):
return score return score
class KeywordRelevanceScorer(URLScorer):
    """Score a URL by the fraction of configured keywords found in it.

    keyword_scorer = KeywordRelevanceScorer(
        keywords=["python", "programming"],
        weight=1.0,
        case_sensitive=False
    )

    - Each keyword counts at most once, however often it occurs
    - Matching is case-insensitive unless ``case_sensitive`` is set
    - The URL is percent-decoded before matching
    """

    def __init__(
        self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False
    ):
        super().__init__(weight=weight)
        self.keywords = keywords
        self.case_sensitive = case_sensitive
        self._compile_keywords()

    def _compile_keywords(self):
        """Compile one literal (escaped) pattern per keyword."""
        flags = re.IGNORECASE if not self.case_sensitive else 0
        compiled = []
        for keyword in self.keywords:
            compiled.append(re.compile(re.escape(keyword), flags))
        self.patterns = compiled

    def _calculate_score(self, url: str) -> float:
        """Return matched keywords / total keywords, or 0.0 with no keywords."""
        decoded_url = unquote(url)
        if not self.patterns:
            return 0.0
        matched = [pattern for pattern in self.patterns if pattern.search(decoded_url)]
        # Normalize score between 0 and 1
        return len(matched) / len(self.patterns)
class FastKeywordRelevanceScorer(FastURLScorer):
__slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive') __slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive')
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False): def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
@@ -310,39 +188,6 @@ class FastKeywordRelevanceScorer(FastURLScorer):
return matches / len(self._keywords) return matches / len(self._keywords)
class PathDepthScorer(URLScorer):
    """Score a URL by how close its path depth is to an optimal depth.

    path_scorer = PathDepthScorer(
        optimal_depth=3,  # Preferred URL depth
        weight=0.7
    )

    - Depth is the number of non-empty path segments
    - Score is 1.0 at the optimal depth
    - Score falls off as 1 / (1 + distance) on either side
    """

    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
        super().__init__(weight=weight)
        self.optimal_depth = optimal_depth

    def _calculate_score(self, url: str) -> float:
        """Return 1 / (1 + |depth - optimal_depth|) for the URL's path."""
        segments = filter(None, urlparse(url).path.split("/"))
        depth = sum(1 for _ in segments)
        # Score decreases as we move away from optimal depth
        return 1.0 / (1.0 + abs(depth - self.optimal_depth))
class FastPathDepthScorer(FastURLScorer):
__slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache __slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache
def __init__(self, optimal_depth: int = 3, weight: float = 1.0): def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
@@ -400,45 +245,6 @@ class FastPathDepthScorer(FastURLScorer):
return 1.0 / (1.0 + distance) return 1.0 / (1.0 + distance)
class ContentTypeScorer(URLScorer):
    """Score a URL by the first configured regex pattern that matches it.

    content_scorer = ContentTypeScorer({
        r'\.html$': 1.0,
        r'\.pdf$': 0.8,
        r'\.xml$': 0.6
    })

    - Keys of ``type_weights`` are regular expressions searched in the URL
    - Patterns are tried in the mapping's iteration order
    - URLs matching no pattern score 0.0
    """

    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
        super().__init__(weight=weight)
        self.type_weights = type_weights
        self._compile_patterns()

    def _compile_patterns(self):
        """Build a compiled-pattern -> weight mapping from ``type_weights``."""
        compiled = {}
        for pattern_text, type_weight in self.type_weights.items():
            compiled[re.compile(pattern_text)] = type_weight
        self.patterns = compiled

    def _calculate_score(self, url: str) -> float:
        """Return the weight of the first matching pattern, else 0.0."""
        matching_weights = (
            type_weight
            for pattern, type_weight in self.patterns.items()
            if pattern.search(url)
        )
        return next(matching_weights, 0.0)
class FastContentTypeScorer(FastURLScorer):
__slots__ = ('_weight', '_exact_types', '_regex_types') __slots__ = ('_weight', '_exact_types', '_regex_types')
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
@@ -524,45 +330,6 @@ class FastContentTypeScorer(FastURLScorer):
return 0.0 return 0.0
class FreshnessScorer(URLScorer):
    """Score URLs based on freshness indicators.

    freshness_scorer = FreshnessScorer(weight=0.9)

    - Looks for a date in the URL (yyyy/mm/dd, yyyy-mm-dd, or a bare /yyyy/)
    - Loses 0.1 per year of distance from ``current_year``
    - Result is clamped to [0.1, 1.0]; URLs without a date score 0.5
    """

    def __init__(self, weight: float = 1.0, current_year: int = 2024):
        """Set up the date patterns.

        Args:
            weight: Multiplier applied to the raw score by ``score``.
            current_year: Reference year that scores 1.0. Defaults to 2024,
                the value that was previously hard-coded.
        """
        super().__init__(weight=weight)
        self.current_year = current_year
        self.date_patterns = [
            r"/(\d{4})/(\d{2})/(\d{2})/",  # yyyy/mm/dd
            r"(\d{4})[-_](\d{2})[-_](\d{2})",  # yyyy-mm-dd
            r"/(\d{4})/",  # year only
        ]
        self._compile_patterns()

    def _compile_patterns(self):
        """Prepare date patterns"""
        self.compiled_patterns = [re.compile(p) for p in self.date_patterns]

    def _calculate_score(self, url: str) -> float:
        """Calculate score based on date indicators.

        The first pattern that matches decides the score; only the year
        group is used.
        """
        for pattern in self.compiled_patterns:
            if match := pattern.search(url):
                year = int(match.group(1))
                # Score higher for more recent years
                raw = 1.0 - (self.current_year - year) * 0.1
                # Clamp: without this, old years go negative and future or
                # ID-like four-digit numbers exceed 1.0.
                return min(1.0, max(0.1, raw))
        return 0.5  # Default score for URLs without dates
class FastFreshnessScorer(FastURLScorer):
__slots__ = ('_weight', '_date_pattern', '_current_year') __slots__ = ('_weight', '_date_pattern', '_current_year')
def __init__(self, weight: float = 1.0, current_year: int = 2024): def __init__(self, weight: float = 1.0, current_year: int = 2024):
@@ -645,41 +412,6 @@ class FastFreshnessScorer(FastURLScorer):
return max(0.1, 1.0 - year_diff * 0.1) return max(0.1, 1.0 - year_diff * 0.1)
class DomainAuthorityScorer(URLScorer):
    """Score URLs based on domain authority.

    authority_scorer = DomainAuthorityScorer({
        "python.org": 1.0,
        "github.com": 0.9,
        "medium.com": 0.7
    })

    - Score is the weight configured for the URL's domain
    - Unknown domains get ``default_weight``
    - A port or userinfo in the URL does not prevent a domain match
    """

    def __init__(
        self,
        domain_weights: Dict[str, float],
        default_weight: float = 0.5,
        weight: float = 1.0,
    ):
        """Store the domain table.

        Args:
            domain_weights: Mapping of domain to raw score.
            default_weight: Raw score for domains missing from the mapping.
            weight: Multiplier applied to the raw score by ``score``.
        """
        super().__init__(weight=weight)
        self.domain_weights = domain_weights
        self.default_weight = default_weight

    def _calculate_score(self, url: str) -> float:
        """Calculate score based on domain authority."""
        parsed = urlparse(url)

        # Exact netloc first, so keys that include a port keep working.
        netloc = parsed.netloc.lower()
        if netloc in self.domain_weights:
            return self.domain_weights[netloc]

        # netloc carries any port/userinfo ("python.org:443"); hostname is
        # the lower-cased host alone, or None when the URL has no host.
        host = parsed.hostname or ""
        return self.domain_weights.get(host, self.default_weight)
class FastDomainAuthorityScorer(FastURLScorer):
__slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains') __slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains')
def __init__( def __init__(
@@ -785,418 +517,3 @@ class FastDomainAuthorityScorer(FastURLScorer):
# Regular path: check all domains # Regular path: check all domains
return self._domain_weights.get(domain, self._default_weight) return self._domain_weights.get(domain, self._default_weight)
def create_balanced_scorer() -> CompositeScorer:
    """Build a CompositeScorer of keyword, path-depth, content-type and freshness scorers."""
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["article", "blog", "news", "research"], weight=1.0
    )
    depth_scorer = PathDepthScorer(optimal_depth=3, weight=0.7)
    type_scorer = ContentTypeScorer(
        type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
        weight=0.8,
    )
    freshness_scorer = FreshnessScorer(weight=0.9)
    return CompositeScorer(
        [keyword_scorer, depth_scorer, type_scorer, freshness_scorer]
    )
def create_balanced_fast_freshness_scorer() -> FastCompositeScorer:
    """Create a balanced composite scorer built entirely from the Fast* scorers.

    Returns:
        A FastCompositeScorer combining keyword, path-depth, content-type
        and freshness scoring, with the same configuration as
        ``create_balanced_scorer``.
    """
    return FastCompositeScorer(
        [
            FastKeywordRelevanceScorer(
                keywords=["article", "blog", "news", "research"], weight=1.0
            ),
            FastPathDepthScorer(optimal_depth=3, weight=0.7),
            FastContentTypeScorer(
                type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
                weight=0.8,
            ),
            FastFreshnessScorer(weight=0.9),
        ]
    )
# Example Usage:
"""
# Create a composite scorer
scorer = CompositeScorer([
    KeywordRelevanceScorer(["python", "programming"], weight=1.0),
    PathDepthScorer(optimal_depth=2, weight=0.7),
    FreshnessScorer(weight=0.8),
    DomainAuthorityScorer(
        domain_weights={
            "python.org": 1.0,
            "github.com": 0.9,
            "medium.com": 0.7
        },
        weight=0.9
    )
])

# Score a URL
score = scorer.score("https://python.org/article/2024/01/new-features")

# Access statistics
print(f"Average score: {scorer.stats.average_score}")
print(f"URLs scored: {scorer.stats.urls_scored}")
"""
def run_scorer_performance_test():
    """Benchmark the original scorers against their Fast* counterparts.

    Prints timings for each individual scorer over the first 1000 generated
    URLs, timings for both composite scorers over every generated URL, and
    the ``sys.getsizeof`` size of each scorer object.
    """
    import random
    import sys
    import time

    # Generate varied test URLs
    base_urls = [
        # News/blog articles with dates
        "https://example.com/2024/01/article-123",
        "https://news.com/2023-12-31/breaking-news",
        "https://blog.site.com/2022_11_15/tech-update",
        # Different content types
        "https://docs.example.com/report.pdf",
        "https://site.com/page.html?q=test",
        "https://api.service.com/data.json",
        # Various domain authorities
        "https://python.org/downloads",
        "https://github.com/repo/code",
        "https://medium.com/@user/post",
        # Different path depths
        "https://site.com/category/subcategory/product/detail",
        "https://shop.com/items",
        "https://edu.org/courses/cs/intro/lecture1",
    ]

    # Create variations
    test_urls = []
    years = list(range(2020, 2025))
    domains = ["example.com", "python.org", "github.com", "medium.com"]
    extensions = ["html", "pdf", "php", "jsx"]

    for base in base_urls:
        test_urls.append(base)
        # Add year variations
        for year in years:
            test_urls.append(f"https://blog.com/{year}/post-{random.randint(1,999)}")
        # Add domain variations
        for domain in domains:
            test_urls.append(f"https://{domain}/article-{random.randint(1,999)}")
        # Add extension variations
        for ext in extensions:
            test_urls.append(f"https://site.com/doc-{random.randint(1,999)}.{ext}")

    # Multiply dataset
    test_urls = test_urls * 5000  # 12 bases x 14 URLs x 5000 = 840k URLs

    def benchmark(name: str, scorer, urls, warmup=True):
        """Time ``scorer.score`` over ``urls``; print and return elapsed ms."""
        if warmup:
            for url in urls[:100]:  # Warmup with subset
                scorer.score(url)

        start = time.perf_counter_ns()
        for url in urls:
            scorer.score(url)
        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms

        # Guard against a zero reading on coarse clocks.
        rate = len(urls) / elapsed * 1000 if elapsed else float("inf")
        print(f"{name:<35} {elapsed:>8.3f} ms ({rate:,.0f} URLs/sec)")
        return elapsed

    print("\nBenchmarking original vs optimized scorers...")
    print("-" * 75)

    # Initialize test data
    domain_weights = {"python.org": 1.0, "github.com": 0.9, "medium.com": 0.7}
    type_weights = {".html$": 1.0, ".pdf$": 0.8, ".php$": 0.6}
    keywords = ["python", "article", "blog", "docs"]

    # Original implementations
    keyword_scorer = KeywordRelevanceScorer(keywords=keywords, weight=1.0)
    path_scorer = PathDepthScorer(optimal_depth=3, weight=0.7)
    content_scorer = ContentTypeScorer(type_weights=type_weights, weight=0.8)
    freshness_scorer = FreshnessScorer(weight=0.9)
    domain_scorer = DomainAuthorityScorer(domain_weights=domain_weights, weight=1.0)

    # Fast implementations
    fast_keyword_scorer = FastKeywordRelevanceScorer(keywords=keywords, weight=1.0)
    fast_path_scorer = FastPathDepthScorer(optimal_depth=3, weight=0.7)
    fast_content_scorer = FastContentTypeScorer(type_weights=type_weights, weight=0.8)
    fast_freshness_scorer = FastFreshnessScorer(weight=0.9)
    fast_domain_scorer = FastDomainAuthorityScorer(
        domain_weights=domain_weights, weight=1.0
    )

    # Test subset for individual scorers
    test_subset = test_urls[:1000]

    print("\nIndividual Scorer Performance (first 1000 URLs):")

    print("\nKeyword Relevance Scorers:")
    benchmark("Original Keyword Scorer", keyword_scorer, test_subset)
    benchmark("Optimized Keyword Scorer", fast_keyword_scorer, test_subset)

    print("\nPath Depth Scorers:")
    benchmark("Original Path Scorer", path_scorer, test_subset)
    benchmark("Optimized Path Scorer", fast_path_scorer, test_subset)

    print("\nContent Type Scorers:")
    benchmark("Original Content Scorer", content_scorer, test_subset)
    benchmark("Optimized Content Scorer", fast_content_scorer, test_subset)

    print("\nFreshness Scorers:")
    benchmark("Original Freshness Scorer", freshness_scorer, test_subset)
    benchmark("Optimized Freshness Scorer", fast_freshness_scorer, test_subset)

    print("\nDomain Authority Scorers:")
    benchmark("Original Domain Scorer", domain_scorer, test_subset)
    benchmark("Optimized Domain Scorer", fast_domain_scorer, test_subset)

    # Test composite scorers
    print("\nComposite Scorer Performance (all URLs):")

    original_composite = CompositeScorer(
        [keyword_scorer, path_scorer, content_scorer, freshness_scorer, domain_scorer]
    )
    fast_composite = FastCompositeScorer(
        [
            fast_keyword_scorer,
            fast_path_scorer,
            fast_content_scorer,
            fast_freshness_scorer,
            fast_domain_scorer,
        ]
    )

    benchmark("Original Composite Scorer", original_composite, test_urls)
    benchmark("Optimized Composite Scorer", fast_composite, test_urls)

    # Memory usage. NOTE: sys.getsizeof is shallow; it does not follow
    # references into the objects a scorer holds.
    print("\nMemory Usage per Scorer:")
    print(f"Original Keyword Scorer: {sys.getsizeof(keyword_scorer):,} bytes")
    print(f"Optimized Keyword Scorer: {sys.getsizeof(fast_keyword_scorer):,} bytes")
    print(f"Original Path Scorer: {sys.getsizeof(path_scorer):,} bytes")
    print(f"Optimized Path Scorer: {sys.getsizeof(fast_path_scorer):,} bytes")
    print(f"Original Content Scorer: {sys.getsizeof(content_scorer):,} bytes")
    print(f"Optimized Content Scorer: {sys.getsizeof(fast_content_scorer):,} bytes")
    print(f"Original Freshness Scorer: {sys.getsizeof(freshness_scorer):,} bytes")
    print(f"Optimized Freshness Scorer: {sys.getsizeof(fast_freshness_scorer):,} bytes")
    print(f"Original Domain Scorer: {sys.getsizeof(domain_scorer):,} bytes")
    print(f"Optimized Domain Scorer: {sys.getsizeof(fast_domain_scorer):,} bytes")
    print(f"Original Composite: {sys.getsizeof(original_composite):,} bytes")
    print(f"Optimized Composite: {sys.getsizeof(fast_composite):,} bytes")
def test_scorers():
    """Check every original scorer and its ``Fast*`` counterpart against expected scores.

    Runs a per-scorer accuracy pass followed by a composite-scorer pass and
    prints one pass/fail line per URL and implementation, plus an overall
    summary. Mismatches are only reported on stdout; nothing is returned.
    """
    # Largest absolute deviation from the expected score that still passes.
    tolerance = 0.00001

    test_cases = [
        # Keyword Scorer Tests
        {
            "scorer_type": "keyword",
            "config": {
                "keywords": ["python", "blog"],
                "weight": 1.0,
                "case_sensitive": False
            },
            "urls": {
                "https://example.com/python-blog": 1.0,
                "https://example.com/PYTHON-BLOG": 1.0,
                "https://example.com/python-only": 0.5,
                "https://example.com/other": 0.0
            }
        },
        # Path Depth Scorer Tests
        {
            "scorer_type": "path_depth",
            "config": {
                "optimal_depth": 2,
                "weight": 1.0
            },
            "urls": {
                "https://example.com/a/b": 1.0,
                "https://example.com/a": 0.5,
                "https://example.com/a/b/c": 0.5,
                "https://example.com": 0.33333333
            }
        },
        # Content Type Scorer Tests
        {
            "scorer_type": "content_type",
            "config": {
                "type_weights": {
                    ".html$": 1.0,
                    ".pdf$": 0.8,
                    ".jpg$": 0.6
                },
                "weight": 1.0
            },
            "urls": {
                "https://example.com/doc.html": 1.0,
                "https://example.com/doc.pdf": 0.8,
                "https://example.com/img.jpg": 0.6,
                "https://example.com/other.txt": 0.0
            }
        },
        # Freshness Scorer Tests
        {
            "scorer_type": "freshness",
            "config": {
                # No current_year here: the original scorer doesn't support it.
                "weight": 1.0,
            },
            "urls": {
                "https://example.com/2024/01/post": 1.0,
                "https://example.com/2023/12/post": 0.9,
                "https://example.com/2022/post": 0.8,
                "https://example.com/no-date": 0.5
            }
        },
        # Domain Authority Scorer Tests
        {
            "scorer_type": "domain",
            "config": {
                "domain_weights": {
                    "python.org": 1.0,
                    "github.com": 0.8,
                    "medium.com": 0.6
                },
                "default_weight": 0.3,
                "weight": 1.0
            },
            "urls": {
                "https://python.org/about": 1.0,
                "https://github.com/repo": 0.8,
                "https://medium.com/post": 0.6,
                "https://unknown.com": 0.3
            }
        }
    ]

    # scorer_type -> (original class, optimized class).
    scorer_classes = {
        "keyword": (KeywordRelevanceScorer, FastKeywordRelevanceScorer),
        "path_depth": (PathDepthScorer, FastPathDepthScorer),
        "content_type": (ContentTypeScorer, FastContentTypeScorer),
        "freshness": (FreshnessScorer, FastFreshnessScorer),
        "domain": (DomainAuthorityScorer, FastDomainAuthorityScorer),
    }

    def create_scorer(scorer_type, config):
        """Return an ``(original, fast)`` pair of scorers built from ``config``.

        Raises:
            ValueError: If ``scorer_type`` has no entry in ``scorer_classes``.
        """
        try:
            original_cls, fast_cls = scorer_classes[scorer_type]
        except KeyError:
            # Fail loudly instead of returning None and breaking on unpacking.
            raise ValueError(f"Unknown scorer type: {scorer_type!r}") from None
        # Only the optimized freshness scorer is given a fixed reference year.
        fast_extra = {"current_year": 2024} if scorer_type == "freshness" else {}
        return original_cls(**config), fast_cls(**config, **fast_extra)

    def report(label, url, expected, actual):
        """Print the outcome for one score and return True when it passed."""
        if abs(actual - expected) > tolerance:
            print(f"❌ {label} Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {actual}")
            return False
        print(f"✅ {label} Passed: URL '{url}'")
        return True

    def run_accuracy_test():
        """Score every URL of every test case with both implementations."""
        print("\nAccuracy Tests:")
        print("-" * 50)
        all_passed = True
        for test_case in test_cases:
            print(f"\nTesting {test_case['scorer_type']} scorer:")
            original, fast = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )
            for url, expected in test_case['urls'].items():
                # Compute both scores before reporting either one.
                orig_score = round(original.score(url), 8)
                fast_score = round(fast.score(url), 8)
                expected = round(expected, 8)
                if not report("Original", url, expected, orig_score):
                    all_passed = False
                if not report("Fast", url, expected, fast_score):
                    all_passed = False
        return all_passed

    def run_composite_test():
        """Combine all scorers into composites and compare against expectations."""
        print("\nTesting Composite Scorer:")
        print("-" * 50)
        # Create test data
        test_urls = {
            "https://python.org/blog/2024/01/new-release.html": 0.86666667,
            "https://github.com/repo/old-code.pdf": 0.62,
            "https://unknown.com/random": 0.26
        }
        # Create composite scorers with all types
        original_scorers = []
        fast_scorers = []
        for test_case in test_cases:
            orig, fast = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )
            original_scorers.append(orig)
            fast_scorers.append(fast)
        original_composite = CompositeScorer(original_scorers, normalize=True)
        fast_composite = FastCompositeScorer(fast_scorers, normalize=True)
        all_passed = True
        for url, expected in test_urls.items():
            orig_score = round(original_composite.score(url), 8)
            fast_score = round(fast_composite.score(url), 8)
            if not report("Original Composite", url, expected, orig_score):
                all_passed = False
            if not report("Fast Composite", url, expected, fast_score):
                all_passed = False
        return all_passed

    # Run tests
    print("Running Scorer Tests...")
    accuracy_passed = run_accuracy_test()
    composite_passed = run_composite_test()
    if accuracy_passed and composite_passed:
        print("\n✨ All tests passed!")
        # Note: Already have performance tests in run_scorer_performance_test()
    else:
        print("\n❌ Some tests failed!")
if __name__ == "__main__":
    # Only the performance run is executed when this file is run as a script.
    run_scorer_performance_test()
    # Uncomment to also run the accuracy and composite checks:
    # test_scorers()

View File

@@ -510,6 +510,7 @@ class HTML2Text(html.parser.HTMLParser):
if tag == "a" and not self.ignore_links: if tag == "a" and not self.ignore_links:
if start: if start:
self.inside_link = True
if ( if (
"href" in attrs "href" in attrs
and attrs["href"] is not None and attrs["href"] is not None
@@ -526,6 +527,7 @@ class HTML2Text(html.parser.HTMLParser):
else: else:
self.astack.append(None) self.astack.append(None)
else: else:
self.inside_link = False
if self.astack: if self.astack:
a = self.astack.pop() a = self.astack.pop()
if self.maybe_automatic_link and not self.empty_link: if self.maybe_automatic_link and not self.empty_link:
@@ -610,13 +612,22 @@ class HTML2Text(html.parser.HTMLParser):
self.o("[" + str(a_props.count) + "]") self.o("[" + str(a_props.count) + "]")
if tag == "dl" and start: if tag == "dl" and start:
self.p() self.p() # Add paragraph break before list starts
if tag == "dt" and not start: self.p_p = 0 # Reset paragraph state
self.pbr()
if tag == "dd" and start: elif tag == "dt" and start:
self.o(" ") if self.p_p == 0: # If not first term
if tag == "dd" and not start: self.o("\n\n") # Add spacing before new term-definition pair
self.pbr() self.p_p = 0 # Reset paragraph state
elif tag == "dt" and not start:
self.o("\n") # Single newline between term and definition
elif tag == "dd" and start:
self.o(" ") # Indent definition
elif tag == "dd" and not start:
self.p_p = 0
if tag in ["ol", "ul"]: if tag in ["ol", "ul"]:
# Google Docs create sub lists as top level lists # Google Docs create sub lists as top level lists
@@ -1026,6 +1037,7 @@ class CustomHTML2Text(HTML2Text):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.inside_pre = False self.inside_pre = False
self.inside_code = False self.inside_code = False
self.inside_link = False
self.preserve_tags = set() # Set of tags to preserve self.preserve_tags = set() # Set of tags to preserve
self.current_preserved_tag = None self.current_preserved_tag = None
self.preserved_content = [] self.preserved_content = []
@@ -1105,11 +1117,17 @@ class CustomHTML2Text(HTML2Text):
# Ignore code tags inside pre blocks if handle_code_in_pre is False # Ignore code tags inside pre blocks if handle_code_in_pre is False
return return
if start: if start:
self.o("`") # Markdown inline code start if not self.inside_link:
self.o("`") # Only output backtick if not inside a link
self.inside_code = True self.inside_code = True
else: else:
self.o("`") # Markdown inline code end if not self.inside_link:
self.o("`") # Only output backtick if not inside a link
self.inside_code = False self.inside_code = False
# If inside a link, let the parent class handle the content
if self.inside_link:
super().handle_tag(tag, attrs, start)
else: else:
super().handle_tag(tag, attrs, start) super().handle_tag(tag, attrs, start)

View File

@@ -179,7 +179,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
"ignore_emphasis": False, "ignore_emphasis": False,
"ignore_links": False, "ignore_links": False,
"ignore_images": False, "ignore_images": False,
"protect_links": True, "protect_links": False,
"single_line_break": True, "single_line_break": True,
"mark_code": True, "mark_code": True,
"escape_snob": False, "escape_snob": False,

View File

@@ -198,7 +198,7 @@ Avoid Common Mistakes:
- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors. - Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places. - Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
- Do not miss closing </blocks> tag at the end of the JSON output. - Do not miss closing </blocks> tag at the end of the JSON output.
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format. - Do not generate the Python code show me how to do the task, this is your task to extract the information and return it in JSON format.
Result Result
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly.""" Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""

View File

@@ -168,10 +168,10 @@ async def main():
"name": "News Items", "name": "News Items",
"baseSelector": "tr.athing", "baseSelector": "tr.athing",
"fields": [ "fields": [
{"name": "title", "selector": "a.storylink", "type": "text"}, {"name": "title", "selector": "span.titleline a", "type": "text"},
{ {
"name": "link", "name": "link",
"selector": "a.storylink", "selector": "span.titleline a",
"type": "attribute", "type": "attribute",
"attribute": "href" "attribute": "href"
} }

View File

@@ -135,14 +135,14 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
# Using OpenAI (requires API token) # Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
llm_provider="openai/gpt-4o", # Default provider provider="openai/gpt-4o", # Default provider
api_token="your-openai-token" # Required for OpenAI api_token="your-openai-token" # Required for OpenAI
) )
# Or using Ollama (open source, no token needed) # Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema( schema = JsonCssExtractionStrategy.generate_schema(
html, html,
llm_provider="ollama/llama3.3", # Open source alternative provider="ollama/llama3.3", # Open source alternative
api_token=None # Not needed for Ollama api_token=None # Not needed for Ollama
) )

View File

@@ -434,7 +434,7 @@ html = """
css_schema = JsonCssExtractionStrategy.generate_schema( css_schema = JsonCssExtractionStrategy.generate_schema(
html, html,
schema_type="css", # This is the default schema_type="css", # This is the default
llm_provider="openai/gpt-4o", # Default provider provider="openai/gpt-4o", # Default provider
api_token="your-openai-token" # Required for OpenAI api_token="your-openai-token" # Required for OpenAI
) )
@@ -442,7 +442,7 @@ css_schema = JsonCssExtractionStrategy.generate_schema(
xpath_schema = JsonXPathExtractionStrategy.generate_schema( xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html, html,
schema_type="xpath", schema_type="xpath",
llm_provider="ollama/llama3.3", # Open source alternative provider="ollama/llama3.3", # Open source alternative
api_token=None # Not needed for Ollama api_token=None # Not needed for Ollama
) )

View File

@@ -0,0 +1,46 @@
import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
    """Deep-crawl techcrunch.com with a best-first strategy and print each result.

    Prints the URL and depth of every result yielded by the crawler, then the
    total elapsed time.
    """
    url_filters = FilterChain(
        [
            URLPatternFilter(patterns=["*2025*"]),
            DomainFilter(allowed_domains=["techcrunch.com"]),
            ContentRelevanceFilter(
                query="Use of artificial intelligence in Defence applications",
                threshold=1,
            ),
            ContentTypeFilter(allowed_types=["text/html", "application/javascript"]),
        ]
    )
    keyword_scorer = KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"])
    crawl_strategy = BestFirstCrawlingStrategy(
        max_depth=2,
        include_external=False,
        filter_chain=url_filters,
        url_scorer=keyword_scorer,
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=crawl_strategy,
        stream=False,
        verbose=True,
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    async with AsyncWebCrawler() as crawler:
        print("Starting deep crawl in streaming mode:")
        # The config is created with stream=False; turn streaming on before the run.
        config.stream = True
        started = time.perf_counter()
        results = await crawler.arun(url="https://techcrunch.com", config=config)
        async for result in results:
            depth = result.metadata.get("depth", 0)
            print(f"{result.url} (Depth: {depth})")
        elapsed = time.perf_counter() - started
        print(f"Duration: {elapsed:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,279 @@
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
async def test_pattern_filter():
    """Apply ``URLPatternFilter`` to a table of patterns and print pass/fail per URL.

    Results are reported on stdout only; nothing is returned or raised when an
    expectation is not met.
    """
    # Each entry is (pattern or list of patterns, {url: expected apply() result}).
    cases = [
        # Simple suffix patterns (*.html)
        ("*.html", {
            "https://example.com/page.html": True,
            "https://example.com/path/doc.html": True,
            "https://example.com/page.htm": False,
            "https://example.com/page.html?param=1": True,
        }),
        # Path segment patterns (*/article/*)
        ("*/article/*", {
            "https://example.com/article/123": True,
            "https://example.com/blog/article/456": True,
            "https://example.com/articles/789": False,
            "https://example.com/article": False,
        }),
        # Complex patterns
        ("blog-*-[0-9]", {
            "https://example.com/blog-post-1": True,
            "https://example.com/blog-test-9": True,
            "https://example.com/blog-post": False,
            "https://example.com/blog-post-x": False,
        }),
        # Multiple patterns case
        (["*.pdf", "*/download/*"], {
            "https://example.com/doc.pdf": True,
            "https://example.com/download/file.txt": True,
            "https://example.com/path/download/doc": True,
            "https://example.com/uploads/file.txt": False,
        }),
        # Edge cases
        ("*", {
            "https://example.com": True,
            "": True,
            "http://test.com/path": True,
        }),
        # Complex regex
        (r"^https?://.*\.example\.com/\d+", {
            "https://sub.example.com/123": True,
            "http://test.example.com/456": True,
            "https://example.com/789": False,
            "https://sub.example.com/abc": False,
        }),
    ]

    print("Running Pattern Filter Tests...")
    print("\nAccuracy Tests:")
    print("-" * 50)

    failures = 0
    for patterns, test_urls in cases:
        # One filter instance per pattern set, reused for all of its URLs.
        filter_obj = URLPatternFilter(patterns)
        for url, expected in test_urls.items():
            result = filter_obj.apply(url)
            if result == expected:
                print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
                continue
            failures += 1
            print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
            print(f" Expected: {expected}, Got: {result}")

    if failures:
        print("\n❌ Some accuracy tests failed!")
    else:
        print("\n✨ All accuracy tests passed!")
async def test_domain_filter():
    """Apply ``DomainFilter`` to allowed/blocked domain setups and print pass/fail per URL.

    Results are reported on stdout only; nothing is returned or raised when an
    expectation is not met.
    """
    # Each entry is ({"allowed": ..., "blocked": ...}, {url: expected apply() result}).
    # A missing key is passed to DomainFilter as None.
    test_cases = [
        # Allowed domains
        ({"allowed": "example.com"}, {
            "https://example.com/page": True,
            "http://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),
        ({"allowed": ["example.com", "test.com"]}, {
            "https://example.com/page": True,
            "https://test.com/home": True,
            "https://other.com": False,
        }),
        # Blocked domains
        ({"blocked": "malicious.com"}, {
            "https://malicious.com": False,
            "https://safe.com": True,
            "http://malicious.com/login": False,
        }),
        ({"blocked": ["spam.com", "ads.com"]}, {
            "https://spam.com": False,
            "https://ads.com/banner": False,
            "https://example.com": True,
        }),
        # Allowed and Blocked combination
        ({"allowed": "example.com", "blocked": "sub.example.com"}, {
            "https://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),
    ]

    def run_accuracy_test():
        """Check every URL of every case; return True when all match."""
        print("\nAccuracy Tests:")
        print("-" * 50)
        all_passed = True
        for params, test_urls in test_cases:
            filter_obj = DomainFilter(
                allowed_domains=params.get("allowed"),
                blocked_domains=params.get("blocked"),
            )
            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f" Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")
        return all_passed

    # Run tests
    print("Running Domain Filter Tests...")
    accuracy_passed = run_accuracy_test()
    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
async def test_content_relevance_filter():
    """Await ``ContentRelevanceFilter.apply`` for two URLs and print pass/fail for each.

    Results are reported on stdout only; nothing is returned or raised when an
    expectation is not met.
    """
    relevance_filter = ContentRelevanceFilter(
        query="What was the cause of american civil war?",
        threshold=1,
    )
    # url -> expected apply() result for the query above.
    expectations = {
        "https://en.wikipedia.org/wiki/Cricket": False,
        "https://en.wikipedia.org/wiki/American_Civil_War": True,
    }

    print("\nRunning Content Relevance Filter Tests...")
    print("-" * 50)

    any_failed = False
    for url, expected in expectations.items():
        result = await relevance_filter.apply(url)
        if result == expected:
            print(f"\u2705 Passed: URL '{url}'")
        else:
            print(f"\u274C Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {result}")
            any_failed = True

    if any_failed:
        print("\n\u274C Some content relevance tests failed!")
    else:
        print("\n\u2728 All content relevance tests passed!")
async def test_content_type_filter():
    """Apply ``ContentTypeFilter`` to several allowed-type setups and print pass/fail per URL.

    Results are reported on stdout only; nothing is returned or raised when an
    expectation is not met.
    """
    # Each entry is ({"allowed": type or list of types}, {url: expected apply() result}).
    test_cases = [
        # Allowed single type
        ({"allowed": "image/png"}, {
            "https://example.com/image.png": True,
            "https://example.com/photo.jpg": False,
            "https://example.com/document.pdf": False,
        }),
        # Multiple allowed types
        ({"allowed": ["image/jpeg", "application/pdf"]}, {
            "https://example.com/photo.jpg": True,
            "https://example.com/document.pdf": True,
            "https://example.com/script.js": False,
        }),
        # No extension should be allowed
        ({"allowed": "application/json"}, {
            "https://example.com/api/data": True,
            "https://example.com/data.json": True,
            "https://example.com/page.html": False,
        }),
        # An unrecognized extension is expected to pass for
        # application/octet-stream, while .zip and .exe are expected to fail.
        ({"allowed": "application/octet-stream"}, {
            "https://example.com/file.unknown": True,
            "https://example.com/archive.zip": False,
            "https://example.com/software.exe": False,
        }),
    ]

    def run_accuracy_test():
        """Check every URL of every case; return True when all match."""
        print("\nAccuracy Tests:")
        print("-" * 50)
        all_passed = True
        for params, test_urls in test_cases:
            filter_obj = ContentTypeFilter(
                allowed_types=params.get("allowed"),
            )
            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f" Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")
        return all_passed

    # Run tests
    print("Running Content Type Filter Tests...")
    accuracy_passed = run_accuracy_test()
    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
async def test_seo_filter():
    """Await ``SEOFilter.apply`` for two URLs and print pass/fail for each.

    Results are reported on stdout only; nothing is returned or raised when an
    expectation is not met.
    """
    seo_filter = SEOFilter(
        threshold=0.5,
        keywords=["SEO", "search engines", "Optimization"],
    )
    # url -> expected apply() result for the keywords above.
    expectations = {
        "https://en.wikipedia.org/wiki/Search_engine_optimization": True,
        "https://en.wikipedia.org/wiki/Randomness": False,
    }

    print("\nRunning SEO Filter Tests...")
    print("-" * 50)

    any_failed = False
    for url, expected in expectations.items():
        result = await seo_filter.apply(url)
        if result == expected:
            print(f"\u2705 Passed: URL '{url}'")
        else:
            print(f"\u274C Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {result}")
            any_failed = True

    if any_failed:
        print("\n\u274C Some SEO filter tests failed!")
    else:
        print("\n\u2728 All SEO filter tests passed!")
import asyncio
# Script entry point: each filter test coroutine is run to completion in turn,
# with a separate asyncio.run() call per test.
if __name__ == "__main__":
    asyncio.run(test_pattern_filter())
    asyncio.run(test_domain_filter())
    asyncio.run(test_content_type_filter())
    asyncio.run(test_content_relevance_filter())
    asyncio.run(test_seo_filter())

View File

@@ -0,0 +1,179 @@
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
def test_scorers():
    """Check every deep-crawl scorer, and their composite, against expected scores.

    Runs a per-scorer accuracy pass followed by a composite-scorer pass and
    prints one pass/fail line per URL plus an overall summary. Mismatches are
    only reported on stdout; nothing is returned.
    """
    # Largest absolute deviation from the expected score that still passes.
    tolerance = 0.00001

    test_cases = [
        # Keyword Scorer Tests
        {
            "scorer_type": "keyword",
            "config": {
                "keywords": ["python", "blog"],
                "weight": 1.0,
                "case_sensitive": False
            },
            "urls": {
                "https://example.com/python-blog": 1.0,
                "https://example.com/PYTHON-BLOG": 1.0,
                "https://example.com/python-only": 0.5,
                "https://example.com/other": 0.0
            }
        },
        # Path Depth Scorer Tests
        {
            "scorer_type": "path_depth",
            "config": {
                "optimal_depth": 2,
                "weight": 1.0
            },
            "urls": {
                "https://example.com/a/b": 1.0,
                "https://example.com/a": 0.5,
                "https://example.com/a/b/c": 0.5,
                "https://example.com": 0.33333333
            }
        },
        # Content Type Scorer Tests
        {
            "scorer_type": "content_type",
            "config": {
                "type_weights": {
                    ".html$": 1.0,
                    ".pdf$": 0.8,
                    ".jpg$": 0.6
                },
                "weight": 1.0
            },
            "urls": {
                "https://example.com/doc.html": 1.0,
                "https://example.com/doc.pdf": 0.8,
                "https://example.com/img.jpg": 0.6,
                "https://example.com/other.txt": 0.0
            }
        },
        # Freshness Scorer Tests
        {
            "scorer_type": "freshness",
            "config": {
                # current_year is not part of the config; create_scorer adds it.
                "weight": 1.0,
            },
            "urls": {
                "https://example.com/2024/01/post": 1.0,
                "https://example.com/2023/12/post": 0.9,
                "https://example.com/2022/post": 0.8,
                "https://example.com/no-date": 0.5
            }
        },
        # Domain Authority Scorer Tests
        {
            "scorer_type": "domain",
            "config": {
                "domain_weights": {
                    "python.org": 1.0,
                    "github.com": 0.8,
                    "medium.com": 0.6
                },
                "default_weight": 0.3,
                "weight": 1.0
            },
            "urls": {
                "https://python.org/about": 1.0,
                "https://github.com/repo": 0.8,
                "https://medium.com/post": 0.6,
                "https://unknown.com": 0.3
            }
        }
    ]

    # scorer_type -> scorer class.
    scorer_classes = {
        "keyword": KeywordRelevanceScorer,
        "path_depth": PathDepthScorer,
        "content_type": ContentTypeScorer,
        "freshness": FreshnessScorer,
        "domain": DomainAuthorityScorer,
    }

    def create_scorer(scorer_type, config):
        """Return a scorer of ``scorer_type`` built from ``config``.

        Raises:
            ValueError: If ``scorer_type`` has no entry in ``scorer_classes``.
        """
        try:
            scorer_cls = scorer_classes[scorer_type]
        except KeyError:
            # Fail loudly instead of returning None and breaking on .score().
            raise ValueError(f"Unknown scorer type: {scorer_type!r}") from None
        # The freshness scorer gets a fixed reference year, presumably so the
        # 2024/2023/2022 expectations do not depend on the run date.
        extra = {"current_year": 2024} if scorer_type == "freshness" else {}
        return scorer_cls(**config, **extra)

    def report(label, url, expected, actual):
        """Print the outcome for one score and return True when it passed."""
        if abs(actual - expected) > tolerance:
            print(f"❌ {label} Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {actual}")
            return False
        print(f"✅ {label} Passed: URL '{url}'")
        return True

    def run_accuracy_test():
        """Score every URL of every test case with its own scorer."""
        print("\nAccuracy Tests:")
        print("-" * 50)
        all_passed = True
        for test_case in test_cases:
            print(f"\nTesting {test_case['scorer_type']} scorer:")
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )
            for url, expected in test_case['urls'].items():
                score = round(scorer.score(url), 8)
                expected = round(expected, 8)
                if not report("Scorer", url, expected, score):
                    all_passed = False
        return all_passed

    def run_composite_test():
        """Combine all scorers into one composite and compare against expectations."""
        print("\nTesting Composite Scorer:")
        print("-" * 50)
        # Create test data
        test_urls = {
            "https://python.org/blog/2024/01/new-release.html": 0.86666667,
            "https://github.com/repo/old-code.pdf": 0.62,
            "https://unknown.com/random": 0.26
        }
        # Create composite scorers with all types
        scorers = [
            create_scorer(test_case['scorer_type'], test_case['config'])
            for test_case in test_cases
        ]
        composite = CompositeScorer(scorers, normalize=True)
        all_passed = True
        for url, expected in test_urls.items():
            score = round(composite.score(url), 8)
            if not report("Composite", url, expected, score):
                all_passed = False
        return all_passed

    # Run tests
    print("Running Scorer Tests...")
    accuracy_passed = run_accuracy_test()
    composite_passed = run_composite_test()
    if accuracy_passed and composite_passed:
        print("\n✨ All tests passed!")
    else:
        print("\n❌ Some tests failed!")
# Script entry point: run the scorer checks when this file is executed directly.
if __name__ == "__main__":
    test_scorers()