2025 feb alpha 1 (#685)
* spelling change in prompt * gpt-4o-mini support * Remove leading Y before here * prompt spell correction * (Docs) Fix numbered list end-of-line formatting Added the missing "two spaces" to add a line break * fix: access downloads_path through browser_config in _handle_download method - Fixes #585 * crawl * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/583 * Docs update: https://github.com/unclecode/crawl4ai/issues/649 * fix: https://github.com/unclecode/crawl4ai/issues/570 * Docs: updated example for content-selection to reflect new changes in yc newsfeed css * Refactor: Removed old filters and replaced with optimised filters * fix: Fixed imports as per the new names of filters * Tests: For deep crawl filters * Refactor: Remove old scorers and replace with optimised ones: Fix imports for all filters and scorers. * fix: awaiting on filters that are async in nature, e.g. content relevance and seo filters * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/715 --------- Co-authored-by: DarshanTank <darshan.tank@gnani.ai> Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com> Co-authored-by: Serhat Soydan <ssoydan@gmail.com> Co-authored-by: cardit1 <maneesh@cardit.in> Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
This commit is contained in:
@@ -17,11 +17,16 @@ from .extraction_strategy import (
|
|||||||
LLMExtractionStrategy,
|
LLMExtractionStrategy,
|
||||||
CosineStrategy,
|
CosineStrategy,
|
||||||
JsonCssExtractionStrategy,
|
JsonCssExtractionStrategy,
|
||||||
JsonXPathExtractionStrategy
|
JsonXPathExtractionStrategy,
|
||||||
)
|
)
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
|
from .content_filter_strategy import (
|
||||||
|
PruningContentFilter,
|
||||||
|
BM25ContentFilter,
|
||||||
|
LLMContentFilter,
|
||||||
|
RelevantContentFilter,
|
||||||
|
)
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
from .async_dispatcher import (
|
from .async_dispatcher import (
|
||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
@@ -29,20 +34,25 @@ from .async_dispatcher import (
|
|||||||
RateLimiter,
|
RateLimiter,
|
||||||
CrawlerMonitor,
|
CrawlerMonitor,
|
||||||
DisplayMode,
|
DisplayMode,
|
||||||
BaseDispatcher
|
BaseDispatcher,
|
||||||
)
|
)
|
||||||
from .docker_client import Crawl4aiDockerClient
|
from .docker_client import Crawl4aiDockerClient
|
||||||
from .hub import CrawlerHub
|
from .hub import CrawlerHub
|
||||||
from .deep_crawling import (
|
from .deep_crawling import (
|
||||||
DeepCrawlStrategy,
|
DeepCrawlStrategy,
|
||||||
BFSDeepCrawlStrategy,
|
BFSDeepCrawlStrategy,
|
||||||
FastFilterChain,
|
FilterChain,
|
||||||
FastContentTypeFilter,
|
ContentTypeFilter,
|
||||||
FastDomainFilter,
|
DomainFilter,
|
||||||
FastURLFilter,
|
URLFilter,
|
||||||
FastFilterStats,
|
FilterStats,
|
||||||
FastKeywordRelevanceScorer,
|
SEOFilter,
|
||||||
FastURLScorer,
|
KeywordRelevanceScorer,
|
||||||
|
URLScorer,
|
||||||
|
CompositeScorer,
|
||||||
|
DomainAuthorityScorer,
|
||||||
|
FreshnessScorer,
|
||||||
|
PathDepthScorer,
|
||||||
BestFirstCrawlingStrategy,
|
BestFirstCrawlingStrategy,
|
||||||
DFSDeepCrawlStrategy,
|
DFSDeepCrawlStrategy,
|
||||||
DeepCrawlDecorator,
|
DeepCrawlDecorator,
|
||||||
@@ -54,13 +64,18 @@ __all__ = [
|
|||||||
"BFSDeepCrawlStrategy",
|
"BFSDeepCrawlStrategy",
|
||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
"DFSDeepCrawlStrategy",
|
"DFSDeepCrawlStrategy",
|
||||||
"FastFilterChain",
|
"FilterChain",
|
||||||
"FastContentTypeFilter",
|
"ContentTypeFilter",
|
||||||
"FastDomainFilter",
|
"DomainFilter",
|
||||||
"FastFilterStats",
|
"FilterStats",
|
||||||
"FastURLFilter",
|
"URLFilter",
|
||||||
"FastKeywordRelevanceScorer",
|
"SEOFilter",
|
||||||
"FastURLScorer",
|
"KeywordRelevanceScorer",
|
||||||
|
"URLScorer",
|
||||||
|
"CompositeScorer",
|
||||||
|
"DomainAuthorityScorer",
|
||||||
|
"FreshnessScorer",
|
||||||
|
"PathDepthScorer",
|
||||||
"DeepCrawlDecorator",
|
"DeepCrawlDecorator",
|
||||||
"CrawlResult",
|
"CrawlResult",
|
||||||
"CrawlerHub",
|
"CrawlerHub",
|
||||||
|
|||||||
@@ -886,7 +886,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
viewport_height = page.viewport_size.get(
|
viewport_size = page.viewport_size
|
||||||
|
if viewport_size is None:
|
||||||
|
await page.set_viewport_size(
|
||||||
|
{"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height}
|
||||||
|
)
|
||||||
|
viewport_size = page.viewport_size
|
||||||
|
|
||||||
|
viewport_height = viewport_size.get(
|
||||||
"height", self.browser_config.viewport_height
|
"height", self.browser_config.viewport_height
|
||||||
)
|
)
|
||||||
current_position = viewport_height
|
current_position = viewport_height
|
||||||
@@ -946,7 +953,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
suggested_filename = download.suggested_filename
|
suggested_filename = download.suggested_filename
|
||||||
download_path = os.path.join(self.downloads_path, suggested_filename)
|
download_path = os.path.join(self.browser_config.downloads_path, suggested_filename)
|
||||||
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
message="Downloading {filename} to {path}",
|
message="Downloading {filename} to {path}",
|
||||||
|
|||||||
@@ -166,7 +166,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Initialize crawler strategy
|
# Initialize crawler strategy
|
||||||
params = {k: v for k, v in kwargs.items() if k in ["browser_congig", "logger"]}
|
params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]}
|
||||||
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
|
||||||
browser_config=browser_config,
|
browser_config=browser_config,
|
||||||
logger=self.logger,
|
logger=self.logger,
|
||||||
|
|||||||
@@ -4,15 +4,22 @@ from .bfs_strategy import BFSDeepCrawlStrategy
|
|||||||
from .bff_strategy import BestFirstCrawlingStrategy
|
from .bff_strategy import BestFirstCrawlingStrategy
|
||||||
from .dfs_strategy import DFSDeepCrawlStrategy
|
from .dfs_strategy import DFSDeepCrawlStrategy
|
||||||
from .filters import (
|
from .filters import (
|
||||||
FastFilterChain,
|
FilterChain,
|
||||||
FastContentTypeFilter,
|
ContentTypeFilter,
|
||||||
FastDomainFilter,
|
DomainFilter,
|
||||||
FastURLFilter,
|
URLFilter,
|
||||||
FastFilterStats,
|
FilterStats,
|
||||||
|
ContentRelevanceFilter,
|
||||||
|
SEOFilter
|
||||||
)
|
)
|
||||||
from .scorers import (
|
from .scorers import (
|
||||||
FastKeywordRelevanceScorer,
|
KeywordRelevanceScorer,
|
||||||
FastURLScorer,
|
URLScorer,
|
||||||
|
CompositeScorer,
|
||||||
|
DomainAuthorityScorer,
|
||||||
|
FreshnessScorer,
|
||||||
|
PathDepthScorer,
|
||||||
|
ContentTypeScorer
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -21,11 +28,18 @@ __all__ = [
|
|||||||
"BFSDeepCrawlStrategy",
|
"BFSDeepCrawlStrategy",
|
||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
"DFSDeepCrawlStrategy",
|
"DFSDeepCrawlStrategy",
|
||||||
"FastFilterChain",
|
"FilterChain",
|
||||||
"FastContentTypeFilter",
|
"ContentTypeFilter",
|
||||||
"FastDomainFilter",
|
"DomainFilter",
|
||||||
"FastURLFilter",
|
"URLFilter",
|
||||||
"FastFilterStats",
|
"FilterStats",
|
||||||
"FastKeywordRelevanceScorer",
|
"ContentRelevanceFilter",
|
||||||
"FastURLScorer",
|
"SEOFilter",
|
||||||
|
"KeywordRelevanceScorer",
|
||||||
|
"URLScorer",
|
||||||
|
"CompositeScorer",
|
||||||
|
"DomainAuthorityScorer",
|
||||||
|
"FreshnessScorer",
|
||||||
|
"PathDepthScorer",
|
||||||
|
"ContentTypeScorer",
|
||||||
]
|
]
|
||||||
@@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from ..models import TraversalStats
|
from ..models import TraversalStats
|
||||||
from .filters import FastFilterChain
|
from .filters import FilterChain
|
||||||
from .scorers import FastURLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
|
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||||
@@ -34,8 +34,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
max_depth: int,
|
max_depth: int,
|
||||||
filter_chain: FastFilterChain = FastFilterChain(),
|
filter_chain: FilterChain = FilterChain(),
|
||||||
url_scorer: Optional[FastURLScorer] = None,
|
url_scorer: Optional[URLScorer] = None,
|
||||||
include_external: bool = False,
|
include_external: bool = False,
|
||||||
logger: Optional[logging.Logger] = None,
|
logger: Optional[logging.Logger] = None,
|
||||||
):
|
):
|
||||||
@@ -64,7 +64,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
self.logger.warning(f"Invalid URL: {url}, error: {e}")
|
self.logger.warning(f"Invalid URL: {url}, error: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if depth != 0 and not self.filter_chain.apply(url):
|
if depth != 0 and not await self.filter_chain.apply(url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
|||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from ..models import TraversalStats
|
from ..models import TraversalStats
|
||||||
from .filters import FastFilterChain
|
from .filters import FilterChain
|
||||||
from .scorers import FastURLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||||
|
|
||||||
@@ -23,8 +23,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
max_depth: int,
|
max_depth: int,
|
||||||
filter_chain: FastFilterChain = FastFilterChain(),
|
filter_chain: FilterChain = FilterChain(),
|
||||||
url_scorer: Optional[FastURLScorer] = None,
|
url_scorer: Optional[URLScorer] = None,
|
||||||
include_external: bool = False,
|
include_external: bool = False,
|
||||||
logger: Optional[logging.Logger] = None,
|
logger: Optional[logging.Logger] = None,
|
||||||
):
|
):
|
||||||
@@ -53,7 +53,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.logger.warning(f"Invalid URL: {url}, error: {e}")
|
self.logger.warning(f"Invalid URL: {url}, error: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if depth != 0 and not self.filter_chain.apply(url):
|
if depth != 0 and not await self.filter_chain.apply(url):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -374,7 +374,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
return (parsed.scheme in {'http', 'https'}
|
return (parsed.scheme in {'http', 'https'}
|
||||||
and '.' in parsed.netloc
|
and '.' in parsed.netloc
|
||||||
and self.filter_chain.apply(url))
|
and await self.filter_chain.apply(url))
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
@@ -8,224 +8,16 @@ from functools import lru_cache
|
|||||||
import fnmatch
|
import fnmatch
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import weakref
|
import weakref
|
||||||
import mimetypes
|
|
||||||
import math
|
import math
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from ..utils import HeadPeekr
|
from ..utils import HeadPeekr
|
||||||
|
import asyncio
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FilterStats:
|
class FilterStats:
|
||||||
# PERF: Using dataclass creates overhead with __init__ and property access
|
|
||||||
# PERF: Could use __slots__ to reduce memory footprint
|
|
||||||
# PERF: Consider using array.array('I') for atomic increments
|
|
||||||
total_urls: int = 0
|
|
||||||
rejected_urls: int = 0
|
|
||||||
passed_urls: int = 0
|
|
||||||
|
|
||||||
|
|
||||||
class URLFilter(ABC):
|
|
||||||
# PERF: Logger creation is expensive, consider lazy initialization
|
|
||||||
# PERF: stats object creation adds overhead for each filter instance
|
|
||||||
def __init__(self, name: str = None):
|
|
||||||
self.name = name or self.__class__.__name__
|
|
||||||
self.stats = FilterStats()
|
|
||||||
self.logger = logging.getLogger(f"urlfilter.{self.name}")
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _update_stats(self, passed: bool):
|
|
||||||
# PERF: Already optimized but could use bitwise operations
|
|
||||||
# PERF: Consider removing stats entirely in production/fast mode
|
|
||||||
self.stats.total_urls += 1
|
|
||||||
self.stats.passed_urls += passed
|
|
||||||
self.stats.rejected_urls += not passed
|
|
||||||
|
|
||||||
|
|
||||||
class FilterChain:
|
|
||||||
# PERF: List traversal for each URL is expensive
|
|
||||||
# PERF: Could use array.array instead of list for filters
|
|
||||||
# PERF: Consider adding fast path for single filter case
|
|
||||||
def __init__(self, filters: List[URLFilter] = None):
|
|
||||||
self.filters = filters or []
|
|
||||||
self.stats = FilterStats()
|
|
||||||
self.logger = logging.getLogger("urlfilter.chain")
|
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
# PERF: Logging on every rejection is expensive
|
|
||||||
# PERF: Could reorder filters by rejection rate
|
|
||||||
# PERF: Consider batch processing mode
|
|
||||||
self.stats.total_urls += 1
|
|
||||||
|
|
||||||
for filter_ in self.filters:
|
|
||||||
if not filter_.apply(url):
|
|
||||||
self.stats.rejected_urls += 1
|
|
||||||
self.logger.debug(f"URL {url} rejected by {filter_.name}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
self.stats.passed_urls += 1
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
class URLPatternFilter(URLFilter):
|
|
||||||
# PERF: Converting glob to regex is expensive
|
|
||||||
# PERF: Multiple regex compilation is slow
|
|
||||||
# PERF: List of patterns causes multiple regex evaluations
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
|
||||||
use_glob: bool = True,
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
|
||||||
self.use_glob = use_glob
|
|
||||||
self._compiled_patterns = []
|
|
||||||
|
|
||||||
# PERF: This could be consolidated into a single regex with OR conditions
|
|
||||||
# PERF: glob_to_regex creates complex patterns, could be simplified
|
|
||||||
for pattern in self.patterns:
|
|
||||||
if isinstance(pattern, str) and use_glob:
|
|
||||||
self._compiled_patterns.append(self._glob_to_regex(pattern))
|
|
||||||
else:
|
|
||||||
self._compiled_patterns.append(
|
|
||||||
re.compile(pattern) if isinstance(pattern, str) else pattern
|
|
||||||
)
|
|
||||||
|
|
||||||
def _glob_to_regex(self, pattern: str) -> Pattern:
|
|
||||||
# PERF: fnmatch.translate creates overly complex patterns
|
|
||||||
# PERF: Could cache common translations
|
|
||||||
return re.compile(fnmatch.translate(pattern))
|
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
# PERF: any() with generator is slower than direct loop with early return
|
|
||||||
# PERF: searching entire string is slower than anchored match
|
|
||||||
matches = any(pattern.search(url) for pattern in self._compiled_patterns)
|
|
||||||
self._update_stats(matches)
|
|
||||||
return matches
|
|
||||||
|
|
||||||
|
|
||||||
class ContentTypeFilter(URLFilter):
|
|
||||||
# PERF: mimetypes guessing is extremely slow
|
|
||||||
# PERF: URL parsing on every check is expensive
|
|
||||||
# PERF: No caching of results for similar extensions
|
|
||||||
def __init__(
|
|
||||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
self.allowed_types = (
|
|
||||||
[allowed_types] if isinstance(allowed_types, str) else allowed_types
|
|
||||||
)
|
|
||||||
self.check_extension = check_extension
|
|
||||||
self._normalize_types()
|
|
||||||
|
|
||||||
def _normalize_types(self):
|
|
||||||
"""Normalize content type strings"""
|
|
||||||
self.allowed_types = [t.lower() for t in self.allowed_types]
|
|
||||||
|
|
||||||
def _check_extension(self, url: str) -> bool:
|
|
||||||
# PERF: urlparse is called on every check
|
|
||||||
# PERF: multiple string splits are expensive
|
|
||||||
# PERF: mimetypes.guess_type is very slow
|
|
||||||
ext = (
|
|
||||||
urlparse(url).path.split(".")[-1].lower()
|
|
||||||
if "." in urlparse(url).path
|
|
||||||
else ""
|
|
||||||
)
|
|
||||||
if not ext:
|
|
||||||
return True
|
|
||||||
|
|
||||||
# PERF: guess_type is main bottleneck
|
|
||||||
guessed_type = mimetypes.guess_type(url)[0]
|
|
||||||
return any(
|
|
||||||
allowed in (guessed_type or "").lower() for allowed in self.allowed_types
|
|
||||||
)
|
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
"""Check if URL's content type is allowed"""
|
|
||||||
result = True
|
|
||||||
if self.check_extension:
|
|
||||||
result = self._check_extension(url)
|
|
||||||
self._update_stats(result)
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
class DomainFilter(URLFilter):
|
|
||||||
# PERF: Set lookups are fast but string normalizations on init are not
|
|
||||||
# PERF: Creating two sets doubles memory usage
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
allowed_domains: Union[str, List[str]] = None,
|
|
||||||
blocked_domains: Union[str, List[str]] = None,
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
# PERF: Normalizing domains on every init is wasteful
|
|
||||||
# PERF: Could use frozenset for immutable lists
|
|
||||||
self.allowed_domains = (
|
|
||||||
set(self._normalize_domains(allowed_domains)) if allowed_domains else None
|
|
||||||
)
|
|
||||||
self.blocked_domains = (
|
|
||||||
set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
|
|
||||||
)
|
|
||||||
|
|
||||||
def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
|
|
||||||
# PERF: strip() and lower() create new strings for each domain
|
|
||||||
# PERF: List comprehension creates intermediate list
|
|
||||||
if isinstance(domains, str):
|
|
||||||
domains = [domains]
|
|
||||||
return [d.lower().strip() for d in domains]
|
|
||||||
|
|
||||||
def _extract_domain(self, url: str) -> str:
|
|
||||||
# PERF: urlparse is called for every URL check
|
|
||||||
# PERF: lower() creates new string every time
|
|
||||||
# PERF: Could cache recent results
|
|
||||||
return urlparse(url).netloc.lower()
|
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
|
||||||
# PERF: Two separate set lookups in worst case
|
|
||||||
# PERF: Domain extraction happens before knowing if we have any filters
|
|
||||||
domain = self._extract_domain(url)
|
|
||||||
|
|
||||||
if domain in self.blocked_domains:
|
|
||||||
self._update_stats(False)
|
|
||||||
return False
|
|
||||||
|
|
||||||
if self.allowed_domains is not None and domain not in self.allowed_domains:
|
|
||||||
self._update_stats(False)
|
|
||||||
return False
|
|
||||||
|
|
||||||
self._update_stats(True)
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# Example usage:
|
|
||||||
def create_common_filter_chain() -> FilterChain:
|
|
||||||
"""Create a commonly used filter chain"""
|
|
||||||
return FilterChain(
|
|
||||||
[
|
|
||||||
URLPatternFilter(
|
|
||||||
[
|
|
||||||
"*.html",
|
|
||||||
"*.htm", # HTML files
|
|
||||||
"*/article/*",
|
|
||||||
"*/blog/*", # Common content paths
|
|
||||||
]
|
|
||||||
),
|
|
||||||
ContentTypeFilter(["text/html", "application/xhtml+xml"]),
|
|
||||||
DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
####################################################################################
|
|
||||||
# Uncledoe: Optimized Version
|
|
||||||
####################################################################################
|
|
||||||
|
|
||||||
|
|
||||||
# Use __slots__ and array for maximum memory/speed efficiency
|
|
||||||
@dataclass
|
|
||||||
class FastFilterStats:
|
|
||||||
__slots__ = ("_counters",)
|
__slots__ = ("_counters",)
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -245,14 +37,14 @@ class FastFilterStats:
|
|||||||
return self._counters[2]
|
return self._counters[2]
|
||||||
|
|
||||||
|
|
||||||
class FastURLFilter(ABC):
|
class URLFilter(ABC):
|
||||||
"""Optimized base filter class"""
|
"""Optimized base filter class"""
|
||||||
|
|
||||||
__slots__ = ("name", "stats", "_logger_ref")
|
__slots__ = ("name", "stats", "_logger_ref")
|
||||||
|
|
||||||
def __init__(self, name: str = None):
|
def __init__(self, name: str = None):
|
||||||
self.name = name or self.__class__.__name__
|
self.name = name or self.__class__.__name__
|
||||||
self.stats = FastFilterStats()
|
self.stats = FilterStats()
|
||||||
# Lazy logger initialization using weakref
|
# Lazy logger initialization using weakref
|
||||||
self._logger_ref = None
|
self._logger_ref = None
|
||||||
|
|
||||||
@@ -274,14 +66,14 @@ class FastURLFilter(ABC):
|
|||||||
self.stats._counters[2] += not passed # rejected
|
self.stats._counters[2] += not passed # rejected
|
||||||
|
|
||||||
|
|
||||||
class FastFilterChain:
|
class FilterChain:
|
||||||
"""Optimized filter chain"""
|
"""Optimized filter chain"""
|
||||||
|
|
||||||
__slots__ = ("filters", "stats", "_logger_ref")
|
__slots__ = ("filters", "stats", "_logger_ref")
|
||||||
|
|
||||||
def __init__(self, filters: List[FastURLFilter] = None):
|
def __init__(self, filters: List[URLFilter] = None):
|
||||||
self.filters = tuple(filters or []) # Immutable tuple for speed
|
self.filters = tuple(filters or []) # Immutable tuple for speed
|
||||||
self.stats = FastFilterStats()
|
self.stats = FilterStats()
|
||||||
self._logger_ref = None
|
self._logger_ref = None
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -291,37 +83,62 @@ class FastFilterChain:
|
|||||||
self._logger_ref = weakref.ref(logger)
|
self._logger_ref = weakref.ref(logger)
|
||||||
return self._logger_ref()
|
return self._logger_ref()
|
||||||
|
|
||||||
def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain":
|
def add_filter(self, filter_: URLFilter) -> "FilterChain":
|
||||||
"""Add a filter to the chain"""
|
"""Add a filter to the chain"""
|
||||||
self.filters.append(filter_)
|
self.filters.append(filter_)
|
||||||
return self # Enable method chaining
|
return self # Enable method chaining
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
async def apply(self, url: str) -> bool:
|
||||||
"""Optimized apply with minimal operations"""
|
"""Apply all filters concurrently when possible"""
|
||||||
self.stats._counters[0] += 1 # total
|
self.stats._counters[0] += 1 # Total processed URLs
|
||||||
|
|
||||||
# Direct tuple iteration is faster than list
|
tasks = []
|
||||||
for f in self.filters:
|
for f in self.filters:
|
||||||
if not f.apply(url):
|
result = f.apply(url)
|
||||||
self.stats._counters[2] += 1 # rejected
|
|
||||||
|
if inspect.isawaitable(result):
|
||||||
|
tasks.append(result) # Collect async tasks
|
||||||
|
elif not result: # Sync rejection
|
||||||
|
self.stats._counters[2] += 1 # Sync rejected
|
||||||
return False
|
return False
|
||||||
|
|
||||||
self.stats._counters[1] += 1 # passed
|
if tasks:
|
||||||
|
results = await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
# Count how many filters rejected
|
||||||
|
rejections = results.count(False)
|
||||||
|
self.stats._counters[2] += rejections
|
||||||
|
|
||||||
|
if not all(results):
|
||||||
|
return False # Stop early if any filter rejected
|
||||||
|
|
||||||
|
self.stats._counters[1] += 1 # Passed
|
||||||
return True
|
return True
|
||||||
|
|
||||||
class FastURLPatternFilter(FastURLFilter):
|
|
||||||
|
class URLPatternFilter(URLFilter):
|
||||||
"""Pattern filter balancing speed and completeness"""
|
"""Pattern filter balancing speed and completeness"""
|
||||||
__slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')
|
|
||||||
|
__slots__ = (
|
||||||
|
"_simple_suffixes",
|
||||||
|
"_simple_prefixes",
|
||||||
|
"_domain_patterns",
|
||||||
|
"_path_patterns",
|
||||||
|
)
|
||||||
|
|
||||||
PATTERN_TYPES = {
|
PATTERN_TYPES = {
|
||||||
'SUFFIX': 1, # *.html
|
"SUFFIX": 1, # *.html
|
||||||
'PREFIX': 2, # /foo/*
|
"PREFIX": 2, # /foo/*
|
||||||
'DOMAIN': 3, # *.example.com
|
"DOMAIN": 3, # *.example.com
|
||||||
'PATH': 4 , # Everything else
|
"PATH": 4, # Everything else
|
||||||
'REGEX': 5
|
"REGEX": 5,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True):
|
def __init__(
|
||||||
|
self,
|
||||||
|
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
||||||
|
use_glob: bool = True,
|
||||||
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
||||||
|
|
||||||
@@ -337,48 +154,50 @@ class FastURLPatternFilter(FastURLFilter):
|
|||||||
def _categorize_pattern(self, pattern: str) -> int:
|
def _categorize_pattern(self, pattern: str) -> int:
|
||||||
"""Categorize pattern for specialized handling"""
|
"""Categorize pattern for specialized handling"""
|
||||||
if not isinstance(pattern, str):
|
if not isinstance(pattern, str):
|
||||||
return self.PATTERN_TYPES['PATH']
|
return self.PATTERN_TYPES["PATH"]
|
||||||
|
|
||||||
# Check if it's a regex pattern
|
# Check if it's a regex pattern
|
||||||
if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
|
if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern:
|
||||||
return self.PATTERN_TYPES['REGEX']
|
return self.PATTERN_TYPES["REGEX"]
|
||||||
|
|
||||||
if pattern.count('*') == 1:
|
if pattern.count("*") == 1:
|
||||||
if pattern.startswith('*.'):
|
if pattern.startswith("*."):
|
||||||
return self.PATTERN_TYPES['SUFFIX']
|
return self.PATTERN_TYPES["SUFFIX"]
|
||||||
if pattern.endswith('/*'):
|
if pattern.endswith("/*"):
|
||||||
return self.PATTERN_TYPES['PREFIX']
|
return self.PATTERN_TYPES["PREFIX"]
|
||||||
|
|
||||||
if '://' in pattern and pattern.startswith('*.'):
|
if "://" in pattern and pattern.startswith("*."):
|
||||||
return self.PATTERN_TYPES['DOMAIN']
|
return self.PATTERN_TYPES["DOMAIN"]
|
||||||
|
|
||||||
return self.PATTERN_TYPES['PATH']
|
return self.PATTERN_TYPES["PATH"]
|
||||||
|
|
||||||
def _add_pattern(self, pattern: str, pattern_type: int):
|
def _add_pattern(self, pattern: str, pattern_type: int):
|
||||||
"""Add pattern to appropriate matcher"""
|
"""Add pattern to appropriate matcher"""
|
||||||
if pattern_type == self.PATTERN_TYPES['REGEX']:
|
if pattern_type == self.PATTERN_TYPES["REGEX"]:
|
||||||
# For regex patterns, compile directly without glob translation
|
# For regex patterns, compile directly without glob translation
|
||||||
if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern):
|
if isinstance(pattern, str) and (
|
||||||
|
pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern
|
||||||
|
):
|
||||||
self._path_patterns.append(re.compile(pattern))
|
self._path_patterns.append(re.compile(pattern))
|
||||||
return
|
return
|
||||||
elif pattern_type == self.PATTERN_TYPES['SUFFIX']:
|
elif pattern_type == self.PATTERN_TYPES["SUFFIX"]:
|
||||||
self._simple_suffixes.add(pattern[2:])
|
self._simple_suffixes.add(pattern[2:])
|
||||||
elif pattern_type == self.PATTERN_TYPES['PREFIX']:
|
elif pattern_type == self.PATTERN_TYPES["PREFIX"]:
|
||||||
self._simple_prefixes.add(pattern[:-2])
|
self._simple_prefixes.add(pattern[:-2])
|
||||||
elif pattern_type == self.PATTERN_TYPES['DOMAIN']:
|
elif pattern_type == self.PATTERN_TYPES["DOMAIN"]:
|
||||||
self._domain_patterns.append(
|
self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\.")))
|
||||||
re.compile(pattern.replace('*.', r'[^/]+\.'))
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
if isinstance(pattern, str):
|
if isinstance(pattern, str):
|
||||||
# Handle complex glob patterns
|
# Handle complex glob patterns
|
||||||
if '**' in pattern:
|
if "**" in pattern:
|
||||||
pattern = pattern.replace('**', '.*')
|
pattern = pattern.replace("**", ".*")
|
||||||
if '{' in pattern:
|
if "{" in pattern:
|
||||||
# Convert {a,b} to (a|b)
|
# Convert {a,b} to (a|b)
|
||||||
pattern = re.sub(r'\{([^}]+)\}',
|
pattern = re.sub(
|
||||||
lambda m: f'({"|".join(m.group(1).split(","))})',
|
r"\{([^}]+)\}",
|
||||||
pattern)
|
lambda m: f'({"|".join(m.group(1).split(","))})',
|
||||||
|
pattern,
|
||||||
|
)
|
||||||
pattern = fnmatch.translate(pattern)
|
pattern = fnmatch.translate(pattern)
|
||||||
self._path_patterns.append(
|
self._path_patterns.append(
|
||||||
pattern if isinstance(pattern, Pattern) else re.compile(pattern)
|
pattern if isinstance(pattern, Pattern) else re.compile(pattern)
|
||||||
@@ -389,8 +208,8 @@ class FastURLPatternFilter(FastURLFilter):
|
|||||||
"""Hierarchical pattern matching"""
|
"""Hierarchical pattern matching"""
|
||||||
# Quick suffix check (*.html)
|
# Quick suffix check (*.html)
|
||||||
if self._simple_suffixes:
|
if self._simple_suffixes:
|
||||||
path = url.split('?')[0]
|
path = url.split("?")[0]
|
||||||
if path.split('/')[-1].split('.')[-1] in self._simple_suffixes:
|
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
|
||||||
self._update_stats(True)
|
self._update_stats(True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -403,7 +222,7 @@ class FastURLPatternFilter(FastURLFilter):
|
|||||||
|
|
||||||
# Prefix check (/foo/*)
|
# Prefix check (/foo/*)
|
||||||
if self._simple_prefixes:
|
if self._simple_prefixes:
|
||||||
path = url.split('?')[0]
|
path = url.split("?")[0]
|
||||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
if any(path.startswith(p) for p in self._simple_prefixes):
|
||||||
self._update_stats(True)
|
self._update_stats(True)
|
||||||
return True
|
return True
|
||||||
@@ -418,7 +237,7 @@ class FastURLPatternFilter(FastURLFilter):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
class FastContentTypeFilter(FastURLFilter):
|
class ContentTypeFilter(URLFilter):
|
||||||
"""Optimized content type filter using fast lookups"""
|
"""Optimized content type filter using fast lookups"""
|
||||||
|
|
||||||
__slots__ = ("allowed_types", "_ext_map", "_check_extension")
|
__slots__ = ("allowed_types", "_ext_map", "_check_extension")
|
||||||
@@ -515,14 +334,30 @@ class FastContentTypeFilter(FastURLFilter):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=1000)
|
@lru_cache(maxsize=1000)
|
||||||
def _extract_extension(path: str) -> str:
|
def _extract_extension(url: str) -> str:
|
||||||
"""Fast extension extraction with caching"""
|
"""Extracts file extension from a URL."""
|
||||||
if "." not in path:
|
# Remove scheme (http://, https://) if present
|
||||||
|
if "://" in url:
|
||||||
|
url = url.split("://", 1)[-1] # Get everything after '://'
|
||||||
|
|
||||||
|
# Remove domain (everything up to the first '/')
|
||||||
|
path_start = url.find("/")
|
||||||
|
path = url[path_start:] if path_start != -1 else ""
|
||||||
|
|
||||||
|
# Extract last filename in path
|
||||||
|
filename = path.rsplit("/", 1)[-1] if "/" in path else ""
|
||||||
|
|
||||||
|
# Extract and validate extension
|
||||||
|
if "." not in filename:
|
||||||
return ""
|
return ""
|
||||||
return path.rpartition(".")[-1].lower()
|
|
||||||
|
return filename.rpartition(".")[-1].lower()
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
|
self,
|
||||||
|
allowed_types: Union[str, List[str]],
|
||||||
|
check_extension: bool = True,
|
||||||
|
ext_map: Dict[str, str] = _MIME_MAP,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Normalize and store as frozenset for fast lookup
|
# Normalize and store as frozenset for fast lookup
|
||||||
@@ -546,9 +381,7 @@ class FastContentTypeFilter(FastURLFilter):
|
|||||||
"""Cached URL checking"""
|
"""Cached URL checking"""
|
||||||
if not self._check_extension:
|
if not self._check_extension:
|
||||||
return True
|
return True
|
||||||
|
ext = self._extract_extension(url)
|
||||||
path = url.split("?")[0] # Fast path split
|
|
||||||
ext = self._extract_extension(path)
|
|
||||||
if not ext:
|
if not ext:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -561,7 +394,7 @@ class FastContentTypeFilter(FastURLFilter):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
class FastDomainFilter(FastURLFilter):
|
class DomainFilter(URLFilter):
|
||||||
"""Optimized domain filter with fast lookups and caching"""
|
"""Optimized domain filter with fast lookups and caching"""
|
||||||
|
|
||||||
__slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
|
__slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
|
||||||
@@ -599,7 +432,7 @@ class FastDomainFilter(FastURLFilter):
|
|||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def _extract_domain(url: str) -> str:
|
def _extract_domain(url: str) -> str:
|
||||||
"""Ultra-fast domain extraction with regex and caching"""
|
"""Ultra-fast domain extraction with regex and caching"""
|
||||||
match = FastDomainFilter._DOMAIN_REGEX.search(url)
|
match = DomainFilter._DOMAIN_REGEX.search(url)
|
||||||
return match.group(1).lower() if match else ""
|
return match.group(1).lower() if match else ""
|
||||||
|
|
||||||
def apply(self, url: str) -> bool:
|
def apply(self, url: str) -> bool:
|
||||||
@@ -627,19 +460,24 @@ class FastDomainFilter(FastURLFilter):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ContentRelevanceFilter(URLFilter):
|
class ContentRelevanceFilter(URLFilter):
|
||||||
"""BM25-based relevance filter using head section content"""
|
"""BM25-based relevance filter using head section content"""
|
||||||
|
|
||||||
__slots__ = ('query_terms', 'threshold', 'k1', 'b', 'avgdl')
|
__slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
|
||||||
|
|
||||||
def __init__(self, query: str, threshold: float,
|
def __init__(
|
||||||
k1: float = 1.2, b: float = 0.75, avgdl: int = 1000):
|
self,
|
||||||
|
query: str,
|
||||||
|
threshold: float,
|
||||||
|
k1: float = 1.2,
|
||||||
|
b: float = 0.75,
|
||||||
|
avgdl: int = 1000,
|
||||||
|
):
|
||||||
super().__init__(name="BM25RelevanceFilter")
|
super().__init__(name="BM25RelevanceFilter")
|
||||||
self.query_terms = self._tokenize(query)
|
self.query_terms = self._tokenize(query)
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
self.k1 = k1 # TF saturation parameter
|
self.k1 = k1 # TF saturation parameter
|
||||||
self.b = b # Length normalization parameter
|
self.b = b # Length normalization parameter
|
||||||
self.avgdl = avgdl # Average document length (empirical value)
|
self.avgdl = avgdl # Average document length (empirical value)
|
||||||
|
|
||||||
async def apply(self, url: str) -> bool:
|
async def apply(self, url: str) -> bool:
|
||||||
@@ -650,8 +488,8 @@ class ContentRelevanceFilter(URLFilter):
|
|||||||
|
|
||||||
# Field extraction with weighting
|
# Field extraction with weighting
|
||||||
fields = {
|
fields = {
|
||||||
'title': HeadPeekr.get_title(head_content) or "",
|
"title": HeadPeekr.get_title(head_content) or "",
|
||||||
'meta': HeadPeekr.extract_meta_tags(head_content)
|
"meta": HeadPeekr.extract_meta_tags(head_content),
|
||||||
}
|
}
|
||||||
doc_text = self._build_document(fields)
|
doc_text = self._build_document(fields)
|
||||||
|
|
||||||
@@ -662,12 +500,14 @@ class ContentRelevanceFilter(URLFilter):
|
|||||||
|
|
||||||
def _build_document(self, fields: Dict) -> str:
|
def _build_document(self, fields: Dict) -> str:
|
||||||
"""Weighted document construction"""
|
"""Weighted document construction"""
|
||||||
return ' '.join([
|
return " ".join(
|
||||||
fields['title'] * 3, # Title weight
|
[
|
||||||
fields['meta'].get('description', '') * 2,
|
fields["title"] * 3, # Title weight
|
||||||
fields['meta'].get('keywords', ''),
|
fields["meta"].get("description", "") * 2,
|
||||||
' '.join(fields['meta'].values())
|
fields["meta"].get("keywords", ""),
|
||||||
])
|
" ".join(fields["meta"].values()),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
def _tokenize(self, text: str) -> List[str]:
|
def _tokenize(self, text: str) -> List[str]:
|
||||||
"""Fast case-insensitive tokenization"""
|
"""Fast case-insensitive tokenization"""
|
||||||
@@ -687,8 +527,9 @@ class ContentRelevanceFilter(URLFilter):
|
|||||||
term_freq = tf[term]
|
term_freq = tf[term]
|
||||||
idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF
|
idf = math.log((1 + 1) / (term_freq + 0.5) + 1) # Simplified IDF
|
||||||
numerator = term_freq * (self.k1 + 1)
|
numerator = term_freq * (self.k1 + 1)
|
||||||
denominator = term_freq + self.k1 * (1 - self.b +
|
denominator = term_freq + self.k1 * (
|
||||||
self.b * (doc_len / self.avgdl))
|
1 - self.b + self.b * (doc_len / self.avgdl)
|
||||||
|
)
|
||||||
score += idf * (numerator / denominator)
|
score += idf * (numerator / denominator)
|
||||||
|
|
||||||
return score
|
return score
|
||||||
@@ -697,29 +538,35 @@ class ContentRelevanceFilter(URLFilter):
|
|||||||
class SEOFilter(URLFilter):
|
class SEOFilter(URLFilter):
|
||||||
"""Quantitative SEO quality assessment filter using head section analysis"""
|
"""Quantitative SEO quality assessment filter using head section analysis"""
|
||||||
|
|
||||||
__slots__ = ('threshold', '_weights', '_kw_patterns')
|
__slots__ = ("threshold", "_weights", "_kw_patterns")
|
||||||
|
|
||||||
# Based on SEMrush/Google ranking factors research
|
# Based on SEMrush/Google ranking factors research
|
||||||
DEFAULT_WEIGHTS = {
|
DEFAULT_WEIGHTS = {
|
||||||
'title_length': 0.15,
|
"title_length": 0.15,
|
||||||
'title_kw': 0.18,
|
"title_kw": 0.18,
|
||||||
'meta_description': 0.12,
|
"meta_description": 0.12,
|
||||||
'canonical': 0.10,
|
"canonical": 0.10,
|
||||||
'robot_ok': 0.20, # Most critical factor
|
"robot_ok": 0.20, # Most critical factor
|
||||||
'schema_org': 0.10,
|
"schema_org": 0.10,
|
||||||
'url_quality': 0.15
|
"url_quality": 0.15,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, threshold: float = 0.65,
|
def __init__(
|
||||||
keywords: List[str] = None,
|
self,
|
||||||
weights: Dict[str, float] = None):
|
threshold: float = 0.65,
|
||||||
|
keywords: List[str] = None,
|
||||||
|
weights: Dict[str, float] = None,
|
||||||
|
):
|
||||||
super().__init__(name="SEOFilter")
|
super().__init__(name="SEOFilter")
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
self._weights = weights or self.DEFAULT_WEIGHTS
|
self._weights = weights or self.DEFAULT_WEIGHTS
|
||||||
self._kw_patterns = re.compile(
|
self._kw_patterns = (
|
||||||
r'\b({})\b'.format('|'.join(map(re.escape, keywords or []))),
|
re.compile(
|
||||||
re.I
|
r"\b({})\b".format("|".join(map(re.escape, keywords or []))), re.I
|
||||||
) if keywords else None
|
)
|
||||||
|
if keywords
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
async def apply(self, url: str) -> bool:
|
async def apply(self, url: str) -> bool:
|
||||||
head_content = await HeadPeekr.peek_html(url)
|
head_content = await HeadPeekr.peek_html(url)
|
||||||
@@ -728,21 +575,24 @@ class SEOFilter(URLFilter):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
meta = HeadPeekr.extract_meta_tags(head_content)
|
meta = HeadPeekr.extract_meta_tags(head_content)
|
||||||
title = HeadPeekr.get_title(head_content) or ''
|
title = HeadPeekr.get_title(head_content) or ""
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
|
|
||||||
scores = {
|
scores = {
|
||||||
'title_length': self._score_title_length(title),
|
"title_length": self._score_title_length(title),
|
||||||
'title_kw': self._score_keyword_presence(title),
|
"title_kw": self._score_keyword_presence(title),
|
||||||
'meta_description': self._score_meta_description(meta.get('description', '')),
|
"meta_description": self._score_meta_description(
|
||||||
'canonical': self._score_canonical(meta.get('canonical'), url),
|
meta.get("description", "")
|
||||||
'robot_ok': 1.0 if 'noindex' not in meta.get('robots', '') else 0.0,
|
),
|
||||||
'schema_org': self._score_schema_org(head_content),
|
"canonical": self._score_canonical(meta.get("canonical"), url),
|
||||||
'url_quality': self._score_url_quality(parsed_url)
|
"robot_ok": 1.0 if "noindex" not in meta.get("robots", "") else 0.0,
|
||||||
|
"schema_org": self._score_schema_org(head_content),
|
||||||
|
"url_quality": self._score_url_quality(parsed_url),
|
||||||
}
|
}
|
||||||
|
|
||||||
total_score = sum(weight * scores[factor]
|
total_score = sum(
|
||||||
for factor, weight in self._weights.items())
|
weight * scores[factor] for factor, weight in self._weights.items()
|
||||||
|
)
|
||||||
|
|
||||||
decision = total_score >= self.threshold
|
decision = total_score >= self.threshold
|
||||||
self._update_stats(decision)
|
self._update_stats(decision)
|
||||||
@@ -750,291 +600,49 @@ class SEOFilter(URLFilter):
|
|||||||
|
|
||||||
def _score_title_length(self, title: str) -> float:
|
def _score_title_length(self, title: str) -> float:
|
||||||
length = len(title)
|
length = len(title)
|
||||||
if 50 <= length <= 60: return 1.0
|
if 50 <= length <= 60:
|
||||||
if 40 <= length < 50 or 60 < length <= 70: return 0.7
|
return 1.0
|
||||||
|
if 40 <= length < 50 or 60 < length <= 70:
|
||||||
|
return 0.7
|
||||||
return 0.3 # Poor length
|
return 0.3 # Poor length
|
||||||
|
|
||||||
def _score_keyword_presence(self, text: str) -> float:
|
def _score_keyword_presence(self, text: str) -> float:
|
||||||
if not self._kw_patterns: return 0.0
|
if not self._kw_patterns:
|
||||||
|
return 0.0
|
||||||
matches = len(self._kw_patterns.findall(text))
|
matches = len(self._kw_patterns.findall(text))
|
||||||
return min(matches * 0.3, 1.0) # Max 3 matches
|
return min(matches * 0.3, 1.0) # Max 3 matches
|
||||||
|
|
||||||
def _score_meta_description(self, desc: str) -> float:
|
def _score_meta_description(self, desc: str) -> float:
|
||||||
length = len(desc)
|
length = len(desc)
|
||||||
if 140 <= length <= 160: return 1.0
|
if 140 <= length <= 160:
|
||||||
|
return 1.0
|
||||||
return 0.5 if 120 <= length <= 200 else 0.2
|
return 0.5 if 120 <= length <= 200 else 0.2
|
||||||
|
|
||||||
def _score_canonical(self, canonical: str, original: str) -> float:
|
def _score_canonical(self, canonical: str, original: str) -> float:
|
||||||
if not canonical: return 0.5 # Neutral score
|
if not canonical:
|
||||||
|
return 0.5 # Neutral score
|
||||||
return 1.0 if canonical == original else 0.2
|
return 1.0 if canonical == original else 0.2
|
||||||
|
|
||||||
def _score_schema_org(self, html: str) -> float:
|
def _score_schema_org(self, html: str) -> float:
|
||||||
# Detect any schema.org markup in head
|
# Detect any schema.org markup in head
|
||||||
return 1.0 if re.search(r'<script[^>]+type=["\']application/ld\+json', html) else 0.0
|
return (
|
||||||
|
1.0
|
||||||
|
if re.search(r'<script[^>]+type=["\']application/ld\+json', html)
|
||||||
|
else 0.0
|
||||||
|
)
|
||||||
|
|
||||||
def _score_url_quality(self, parsed_url) -> float:
|
def _score_url_quality(self, parsed_url) -> float:
|
||||||
score = 1.0
|
score = 1.0
|
||||||
path = parsed_url.path.lower()
|
path = parsed_url.path.lower()
|
||||||
|
|
||||||
# Penalty factors
|
# Penalty factors
|
||||||
if len(path) > 80: score *= 0.7
|
if len(path) > 80:
|
||||||
if re.search(r'\d{4}', path): score *= 0.8 # Numbers in path
|
score *= 0.7
|
||||||
if parsed_url.query: score *= 0.6 # URL parameters
|
if re.search(r"\d{4}", path):
|
||||||
if '_' in path: score *= 0.9 # Underscores vs hyphens
|
score *= 0.8 # Numbers in path
|
||||||
|
if parsed_url.query:
|
||||||
|
score *= 0.6 # URL parameters
|
||||||
|
if "_" in path:
|
||||||
|
score *= 0.9 # Underscores vs hyphens
|
||||||
|
|
||||||
return score
|
return score
|
||||||
|
|
||||||
def create_fast_filter_chain() -> FastFilterChain:
|
|
||||||
"""Create an optimized filter chain with filters ordered by rejection rate"""
|
|
||||||
return FastFilterChain(
|
|
||||||
[
|
|
||||||
# Domain filter first (fastest rejection)
|
|
||||||
FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]),
|
|
||||||
# Content filter second (medium speed)
|
|
||||||
FastContentTypeFilter(["text/html", "application/xhtml+xml"]),
|
|
||||||
# Pattern filter last (most expensive)
|
|
||||||
FastURLPatternFilter(
|
|
||||||
[
|
|
||||||
"*.html",
|
|
||||||
"*.htm",
|
|
||||||
"*/article/*",
|
|
||||||
"*/blog/*",
|
|
||||||
]
|
|
||||||
),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def run_performance_test():
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Generate test URLs
|
|
||||||
base_urls = [
|
|
||||||
"https://example.com/article/123",
|
|
||||||
"https://blog.example.com/post/456",
|
|
||||||
"https://ads.example.com/tracking",
|
|
||||||
"https://example.com/about.html",
|
|
||||||
"https://analytics.example.com/script.js",
|
|
||||||
"https://example.com/products.php",
|
|
||||||
"https://subdomain.example.com/blog/post-123",
|
|
||||||
"https://example.com/path/file.pdf",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Create more varied test data
|
|
||||||
test_urls = []
|
|
||||||
for base in base_urls:
|
|
||||||
# Add original
|
|
||||||
test_urls.append(base)
|
|
||||||
# Add variations
|
|
||||||
parts = base.split("/")
|
|
||||||
for i in range(10):
|
|
||||||
parts[-1] = f"page_{i}.html"
|
|
||||||
test_urls.append("/".join(parts))
|
|
||||||
|
|
||||||
# Multiply to get enough test data
|
|
||||||
test_urls = test_urls * 10000 # Creates ~800k URLs
|
|
||||||
|
|
||||||
def benchmark(name: str, func, *args, warmup=True):
|
|
||||||
if warmup:
|
|
||||||
# Warmup run
|
|
||||||
func(*args)
|
|
||||||
|
|
||||||
# Actual timing
|
|
||||||
start = time.perf_counter_ns()
|
|
||||||
result = func(*args)
|
|
||||||
elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms
|
|
||||||
print(
|
|
||||||
f"{name:<30} {elapsed:>8.3f} ms ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)"
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
|
|
||||||
print("\nBenchmarking original vs optimized implementations...")
|
|
||||||
print("-" * 70)
|
|
||||||
|
|
||||||
# Original implementation
|
|
||||||
pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
|
|
||||||
content_filter = ContentTypeFilter(["text/html"])
|
|
||||||
domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
|
|
||||||
chain = FilterChain([pattern_filter, content_filter, domain_filter])
|
|
||||||
|
|
||||||
# Optimized implementation
|
|
||||||
fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
|
|
||||||
fast_content_filter = FastContentTypeFilter(["text/html"])
|
|
||||||
fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
|
|
||||||
fast_chain = FastFilterChain(
|
|
||||||
[fast_domain_filter, fast_content_filter, fast_pattern_filter]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test individual filters
|
|
||||||
print("\nSingle filter performance (first 1000 URLs):")
|
|
||||||
test_subset = test_urls[:1000]
|
|
||||||
|
|
||||||
print("\nPattern Filters:")
|
|
||||||
benchmark(
|
|
||||||
"Original Pattern Filter",
|
|
||||||
lambda: [pattern_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
benchmark(
|
|
||||||
"Optimized Pattern Filter",
|
|
||||||
lambda: [fast_pattern_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
|
|
||||||
print("\nContent Filters:")
|
|
||||||
benchmark(
|
|
||||||
"Original Content Filter",
|
|
||||||
lambda: [content_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
benchmark(
|
|
||||||
"Optimized Content Filter",
|
|
||||||
lambda: [fast_content_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
|
|
||||||
print("\nDomain Filters:")
|
|
||||||
benchmark(
|
|
||||||
"Original Domain Filter",
|
|
||||||
lambda: [domain_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
benchmark(
|
|
||||||
"Optimized Domain Filter",
|
|
||||||
lambda: [fast_domain_filter.apply(url) for url in test_subset],
|
|
||||||
)
|
|
||||||
|
|
||||||
print("\nFull Chain Performance (all URLs):")
|
|
||||||
# Test chain
|
|
||||||
benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
|
|
||||||
benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])
|
|
||||||
|
|
||||||
# Memory usage
|
|
||||||
import sys
|
|
||||||
|
|
||||||
print("\nMemory Usage per Filter:")
|
|
||||||
print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
|
|
||||||
print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
|
|
||||||
print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
|
|
||||||
print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
|
|
||||||
print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
|
|
||||||
print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
|
|
||||||
|
|
||||||
def test_pattern_filter():
|
|
||||||
import time
|
|
||||||
from itertools import chain
|
|
||||||
|
|
||||||
# Test cases as list of tuples instead of dict for multiple patterns
|
|
||||||
test_cases = [
|
|
||||||
# Simple suffix patterns (*.html)
|
|
||||||
("*.html", {
|
|
||||||
"https://example.com/page.html": True,
|
|
||||||
"https://example.com/path/doc.html": True,
|
|
||||||
"https://example.com/page.htm": False,
|
|
||||||
"https://example.com/page.html?param=1": True,
|
|
||||||
}),
|
|
||||||
|
|
||||||
# Path prefix patterns (/foo/*)
|
|
||||||
("*/article/*", {
|
|
||||||
"https://example.com/article/123": True,
|
|
||||||
"https://example.com/blog/article/456": True,
|
|
||||||
"https://example.com/articles/789": False,
|
|
||||||
"https://example.com/article": False,
|
|
||||||
}),
|
|
||||||
|
|
||||||
# Complex patterns
|
|
||||||
("blog-*-[0-9]", {
|
|
||||||
"https://example.com/blog-post-1": True,
|
|
||||||
"https://example.com/blog-test-9": True,
|
|
||||||
"https://example.com/blog-post": False,
|
|
||||||
"https://example.com/blog-post-x": False,
|
|
||||||
}),
|
|
||||||
|
|
||||||
# Multiple patterns case
|
|
||||||
(["*.pdf", "*/download/*"], {
|
|
||||||
"https://example.com/doc.pdf": True,
|
|
||||||
"https://example.com/download/file.txt": True,
|
|
||||||
"https://example.com/path/download/doc": True,
|
|
||||||
"https://example.com/uploads/file.txt": False,
|
|
||||||
}),
|
|
||||||
|
|
||||||
# Edge cases
|
|
||||||
("*", {
|
|
||||||
"https://example.com": True,
|
|
||||||
"": True,
|
|
||||||
"http://test.com/path": True,
|
|
||||||
}),
|
|
||||||
|
|
||||||
# Complex regex
|
|
||||||
(r"^https?://.*\.example\.com/\d+", {
|
|
||||||
"https://sub.example.com/123": True,
|
|
||||||
"http://test.example.com/456": True,
|
|
||||||
"https://example.com/789": False,
|
|
||||||
"https://sub.example.com/abc": False,
|
|
||||||
})
|
|
||||||
]
|
|
||||||
|
|
||||||
def run_accuracy_test():
|
|
||||||
print("\nAccuracy Tests:")
|
|
||||||
print("-" * 50)
|
|
||||||
|
|
||||||
all_passed = True
|
|
||||||
for patterns, test_urls in test_cases:
|
|
||||||
filter_obj = FastURLPatternFilter(patterns)
|
|
||||||
|
|
||||||
for url, expected in test_urls.items():
|
|
||||||
result = filter_obj.apply(url)
|
|
||||||
if result != expected:
|
|
||||||
print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
|
|
||||||
print(f" Expected: {expected}, Got: {result}")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
|
|
||||||
|
|
||||||
return all_passed
|
|
||||||
|
|
||||||
def run_speed_test():
|
|
||||||
print("\nSpeed Tests:")
|
|
||||||
print("-" * 50)
|
|
||||||
|
|
||||||
# Create a large set of test URLs
|
|
||||||
all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases))
|
|
||||||
test_urls = all_urls * 10000 # 100K+ URLs
|
|
||||||
|
|
||||||
# Test both implementations
|
|
||||||
original = URLPatternFilter(["*.html", "*/article/*", "blog-*"])
|
|
||||||
optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"])
|
|
||||||
|
|
||||||
def benchmark(name, filter_obj):
|
|
||||||
start = time.perf_counter()
|
|
||||||
for url in test_urls:
|
|
||||||
filter_obj.apply(url)
|
|
||||||
elapsed = time.perf_counter() - start
|
|
||||||
urls_per_sec = len(test_urls) / elapsed
|
|
||||||
print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")
|
|
||||||
|
|
||||||
benchmark("Original Filter:", original)
|
|
||||||
benchmark("Optimized Filter:", optimized)
|
|
||||||
|
|
||||||
# Run tests
|
|
||||||
print("Running Pattern Filter Tests...")
|
|
||||||
accuracy_passed = run_accuracy_test()
|
|
||||||
|
|
||||||
if accuracy_passed:
|
|
||||||
print("\n✨ All accuracy tests passed!")
|
|
||||||
run_speed_test()
|
|
||||||
else:
|
|
||||||
print("\n❌ Some accuracy tests failed!")
|
|
||||||
|
|
||||||
async def test_content_relevancy_filter():
|
|
||||||
# Initialize with query and threshold (tune based on your corpus)
|
|
||||||
relevance_filter = ContentRelevanceFilter(
|
|
||||||
query="machine learning",
|
|
||||||
threshold=2.5
|
|
||||||
)
|
|
||||||
|
|
||||||
# In your crawler loop
|
|
||||||
for url in ["https://example.com", "https://example.com/blog/post-123"]:
|
|
||||||
if await relevance_filter.apply(url):
|
|
||||||
print(f"✅ Relevant: {url}")
|
|
||||||
else:
|
|
||||||
print(f"❌ Not Relevant: {url}")
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_performance_test()
|
|
||||||
# test_pattern_filter()
|
|
||||||
@@ -23,35 +23,7 @@ _FRESHNESS_SCORES = [
|
|||||||
0.5, # 5 years ago
|
0.5, # 5 years ago
|
||||||
]
|
]
|
||||||
|
|
||||||
# Pre-computed normalization factors for powers of 2
|
|
||||||
_POW2_NORM = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625]
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class ScoringStats:
|
class ScoringStats:
|
||||||
# PERF: Dataclass introduces overhead with property access and __init__
|
|
||||||
# PERF: Float operations and comparisons are expensive for high-frequency updates
|
|
||||||
# PERF: Property calculation on every access is inefficient
|
|
||||||
# PERF: Storing min/max adds memory overhead and comparison costs
|
|
||||||
# PERF: Using inf/-inf creates unnecessary float objects
|
|
||||||
urls_scored: int = 0
|
|
||||||
total_score: float = 0.0
|
|
||||||
min_score: float = float("inf") # Expensive object creation
|
|
||||||
max_score: float = float("-inf")
|
|
||||||
|
|
||||||
def update(self, score: float):
|
|
||||||
"""Update scoring statistics"""
|
|
||||||
self.urls_scored += 1
|
|
||||||
self.total_score += score
|
|
||||||
self.min_score = min(self.min_score, score)
|
|
||||||
self.max_score = max(self.max_score, score)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def average_score(self) -> float:
|
|
||||||
"""Calculate average score"""
|
|
||||||
return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0
|
|
||||||
|
|
||||||
class FastScoringStats:
|
|
||||||
__slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')
|
__slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -88,32 +60,7 @@ class FastScoringStats:
|
|||||||
if self._max_score is None:
|
if self._max_score is None:
|
||||||
self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0
|
self._max_score = self._total_score / self._urls_scored if self._urls_scored else 0.0
|
||||||
return self._max_score
|
return self._max_score
|
||||||
|
|
||||||
class URLScorer(ABC):
|
class URLScorer(ABC):
|
||||||
# PERF: Property access overhead for weight
|
|
||||||
# PERF: Unnecessary name attribute
|
|
||||||
# PERF: Stats object creation overhead
|
|
||||||
# PERF: Logger creation for each instance
|
|
||||||
# PERF: Abstract method overhead
|
|
||||||
|
|
||||||
def __init__(self, weight: float = 1.0, name: str = None):
|
|
||||||
self.weight = weight
|
|
||||||
self.name = name or self.__class__.__name__
|
|
||||||
self.stats = ScoringStats()
|
|
||||||
self.logger = logging.getLogger(f"urlscorer.{self.name}")
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def score(self, url: str) -> float:
|
|
||||||
raw_score = self._calculate_score(url)
|
|
||||||
weighted_score = raw_score * self.weight
|
|
||||||
self.stats.update(weighted_score)
|
|
||||||
return weighted_score
|
|
||||||
|
|
||||||
# Optimized base class
|
|
||||||
class FastURLScorer(ABC):
|
|
||||||
__slots__ = ('_weight', '_stats')
|
__slots__ = ('_weight', '_stats')
|
||||||
|
|
||||||
def __init__(self, weight: float = 1.0):
|
def __init__(self, weight: float = 1.0):
|
||||||
@@ -142,31 +89,6 @@ class FastURLScorer(ABC):
|
|||||||
return self._weight
|
return self._weight
|
||||||
|
|
||||||
class CompositeScorer(URLScorer):
|
class CompositeScorer(URLScorer):
|
||||||
# PERF: Unnecessary list iteration for each score
|
|
||||||
# PERF: Creates new list for scores
|
|
||||||
# PERF: Division on every normalization
|
|
||||||
# PERF: No parallelization for independent scorers
|
|
||||||
# PERF: No short circuit for zero scores
|
|
||||||
# PERF: No weighting optimization
|
|
||||||
# PERF: No caching of combined scores
|
|
||||||
# PERF: List allocation for scores storag
|
|
||||||
"""Combines multiple scorers with weights"""
|
|
||||||
|
|
||||||
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
|
|
||||||
super().__init__(name="CompositeScorer")
|
|
||||||
self.scorers = scorers
|
|
||||||
self.normalize = normalize
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
scores = [scorer.score(url) for scorer in self.scorers]
|
|
||||||
total_score = sum(scores)
|
|
||||||
|
|
||||||
if self.normalize and scores:
|
|
||||||
total_score /= len(scores)
|
|
||||||
|
|
||||||
return total_score
|
|
||||||
|
|
||||||
class FastCompositeScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')
|
__slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')
|
||||||
|
|
||||||
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
|
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
|
||||||
@@ -236,50 +158,6 @@ class FastCompositeScorer(FastURLScorer):
|
|||||||
return score
|
return score
|
||||||
|
|
||||||
class KeywordRelevanceScorer(URLScorer):
|
class KeywordRelevanceScorer(URLScorer):
|
||||||
# PERF: Regex compilation and pattern matching is expensive
|
|
||||||
# PERF: List comprehension with pattern search has high overhead
|
|
||||||
# PERF: URL decoding on every calculation
|
|
||||||
# PERF: Division operation for normalization is costly
|
|
||||||
# PERF: Case insensitive regex adds overhead
|
|
||||||
# PERF: No pattern caching or reuse
|
|
||||||
# PERF: Using inheritance adds method lookup overhead
|
|
||||||
|
|
||||||
"""Score URLs based on keyword relevance.
|
|
||||||
|
|
||||||
keyword_scorer = KeywordRelevanceScorer(
|
|
||||||
keywords=["python", "programming"],
|
|
||||||
weight=1.0,
|
|
||||||
case_sensitive=False
|
|
||||||
)
|
|
||||||
|
|
||||||
- Score based on keyword matches
|
|
||||||
- Case sensitivity options
|
|
||||||
- Weighted scoring
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False
|
|
||||||
):
|
|
||||||
super().__init__(weight=weight)
|
|
||||||
self.keywords = keywords
|
|
||||||
self.case_sensitive = case_sensitive
|
|
||||||
self._compile_keywords()
|
|
||||||
|
|
||||||
def _compile_keywords(self):
|
|
||||||
"""Prepare keywords for matching"""
|
|
||||||
flags = 0 if self.case_sensitive else re.IGNORECASE
|
|
||||||
self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords]
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
"""Calculate score based on keyword matches"""
|
|
||||||
decoded_url = unquote(url)
|
|
||||||
total_matches = sum(
|
|
||||||
1 for pattern in self.patterns if pattern.search(decoded_url)
|
|
||||||
)
|
|
||||||
# Normalize score between 0 and 1
|
|
||||||
return total_matches / len(self.patterns) if self.patterns else 0.0
|
|
||||||
|
|
||||||
class FastKeywordRelevanceScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive')
|
__slots__ = ('_weight', '_stats', '_keywords', '_case_sensitive')
|
||||||
|
|
||||||
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
|
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
|
||||||
@@ -310,39 +188,6 @@ class FastKeywordRelevanceScorer(FastURLScorer):
|
|||||||
return matches / len(self._keywords)
|
return matches / len(self._keywords)
|
||||||
|
|
||||||
class PathDepthScorer(URLScorer):
|
class PathDepthScorer(URLScorer):
|
||||||
# PERF: URL parsing on every call is expensive
|
|
||||||
# PERF: Split and list comprehension creates temporary lists
|
|
||||||
# PERF: abs() call adds function overhead
|
|
||||||
# PERF: Division and addition in score calculation are expensive for high frequency
|
|
||||||
# PERF: Path parts filtering creates extra list
|
|
||||||
# PERF: Inherits URLScorer adding method lookup overhead
|
|
||||||
# PERF: No caching of parsed URLs or calculated depths
|
|
||||||
"""Score URLs based on their path depth.
|
|
||||||
|
|
||||||
path_scorer = PathDepthScorer(
|
|
||||||
optimal_depth=3, # Preferred URL depth
|
|
||||||
weight=0.7
|
|
||||||
)
|
|
||||||
|
|
||||||
- Score based on URL path depth
|
|
||||||
- Configurable optimal depth
|
|
||||||
- Diminishing returns for deeper paths
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
|
|
||||||
super().__init__(weight=weight)
|
|
||||||
self.optimal_depth = optimal_depth
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
"""Calculate score based on path depth"""
|
|
||||||
path = urlparse(url).path
|
|
||||||
depth = len([x for x in path.split("/") if x])
|
|
||||||
|
|
||||||
# Score decreases as we move away from optimal depth
|
|
||||||
distance_from_optimal = abs(depth - self.optimal_depth)
|
|
||||||
return 1.0 / (1.0 + distance_from_optimal)
|
|
||||||
|
|
||||||
class FastPathDepthScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache
|
__slots__ = ('_weight', '_stats', '_optimal_depth') # Remove _url_cache
|
||||||
|
|
||||||
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
|
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
|
||||||
@@ -400,45 +245,6 @@ class FastPathDepthScorer(FastURLScorer):
|
|||||||
return 1.0 / (1.0 + distance)
|
return 1.0 / (1.0 + distance)
|
||||||
|
|
||||||
class ContentTypeScorer(URLScorer):
|
class ContentTypeScorer(URLScorer):
|
||||||
# PERF: Regex compilation on every initialization
|
|
||||||
# PERF: Dict lookup and regex search for every URL
|
|
||||||
# PERF: Pattern iteration adds loop overhead
|
|
||||||
# PERF: No pattern priority or short-circuit
|
|
||||||
# PERF: Dict storage has lookup overhead
|
|
||||||
# PERF: Missing extension fast path check
|
|
||||||
# PERF: Unnecessary regex for simple extensions
|
|
||||||
"""Score URLs based on content type preferences.
|
|
||||||
|
|
||||||
content_scorer = ContentTypeScorer({
|
|
||||||
r'\.html$': 1.0,
|
|
||||||
r'\.pdf$': 0.8,
|
|
||||||
r'\.xml$': 0.6
|
|
||||||
})
|
|
||||||
|
|
||||||
- Score based on file types
|
|
||||||
- Configurable type weights
|
|
||||||
- Pattern matching support
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
|
|
||||||
super().__init__(weight=weight)
|
|
||||||
self.type_weights = type_weights
|
|
||||||
self._compile_patterns()
|
|
||||||
|
|
||||||
def _compile_patterns(self):
|
|
||||||
"""Prepare content type patterns"""
|
|
||||||
self.patterns = {
|
|
||||||
re.compile(pattern): weight for pattern, weight in self.type_weights.items()
|
|
||||||
}
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
"""Calculate score based on content type matching"""
|
|
||||||
for pattern, weight in self.patterns.items():
|
|
||||||
if pattern.search(url):
|
|
||||||
return weight
|
|
||||||
return 0.0
|
|
||||||
|
|
||||||
class FastContentTypeScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_weight', '_exact_types', '_regex_types')
|
__slots__ = ('_weight', '_exact_types', '_regex_types')
|
||||||
|
|
||||||
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
|
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
|
||||||
@@ -524,45 +330,6 @@ class FastContentTypeScorer(FastURLScorer):
|
|||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
class FreshnessScorer(URLScorer):
|
class FreshnessScorer(URLScorer):
|
||||||
# PERF: Multiple regex compilations for each pattern
|
|
||||||
# PERF: Tries all patterns sequentially
|
|
||||||
# PERF: Regex pattern matching is expensive
|
|
||||||
# PERF: Int conversion and arithmetic for every match
|
|
||||||
# PERF: Repeated constant value (2024) hardcoded
|
|
||||||
# PERF: No URL caching
|
|
||||||
# PERF: Complex patterns with redundant groups
|
|
||||||
# PERF: Unnecessary list of patterns when could combine
|
|
||||||
"""Score URLs based on freshness indicators.
|
|
||||||
|
|
||||||
freshness_scorer = FreshnessScorer(weight=0.9)
|
|
||||||
|
|
||||||
Score based on date indicators in URLs
|
|
||||||
Multiple date format support
|
|
||||||
Recency weighting"""
|
|
||||||
|
|
||||||
def __init__(self, weight: float = 1.0):
|
|
||||||
super().__init__(weight=weight)
|
|
||||||
self.date_patterns = [
|
|
||||||
r"/(\d{4})/(\d{2})/(\d{2})/", # yyyy/mm/dd
|
|
||||||
r"(\d{4})[-_](\d{2})[-_](\d{2})", # yyyy-mm-dd
|
|
||||||
r"/(\d{4})/", # year only
|
|
||||||
]
|
|
||||||
self._compile_patterns()
|
|
||||||
|
|
||||||
def _compile_patterns(self):
|
|
||||||
"""Prepare date patterns"""
|
|
||||||
self.compiled_patterns = [re.compile(p) for p in self.date_patterns]
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
"""Calculate score based on date indicators"""
|
|
||||||
for pattern in self.compiled_patterns:
|
|
||||||
if match := pattern.search(url):
|
|
||||||
year = int(match.group(1))
|
|
||||||
# Score higher for more recent years
|
|
||||||
return 1.0 - (2024 - year) * 0.1
|
|
||||||
return 0.5 # Default score for URLs without dates
|
|
||||||
|
|
||||||
class FastFreshnessScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_weight', '_date_pattern', '_current_year')
|
__slots__ = ('_weight', '_date_pattern', '_current_year')
|
||||||
|
|
||||||
def __init__(self, weight: float = 1.0, current_year: int = 2024):
|
def __init__(self, weight: float = 1.0, current_year: int = 2024):
|
||||||
@@ -645,41 +412,6 @@ class FastFreshnessScorer(FastURLScorer):
|
|||||||
return max(0.1, 1.0 - year_diff * 0.1)
|
return max(0.1, 1.0 - year_diff * 0.1)
|
||||||
|
|
||||||
class DomainAuthorityScorer(URLScorer):
|
class DomainAuthorityScorer(URLScorer):
|
||||||
# PERF: URL parsing on every score calculation
|
|
||||||
# PERF: Repeated domain extraction
|
|
||||||
# PERF: Case conversion on every lookup
|
|
||||||
# PERF: Dict lookup without caching
|
|
||||||
# PERF: Processes full URL when only needs domain
|
|
||||||
# PERF: No fast path for common domains
|
|
||||||
# PERF: Netloc includes port which requires extra processing
|
|
||||||
"""Score URLs based on domain authority.
|
|
||||||
|
|
||||||
authority_scorer = DomainAuthorityScorer({
|
|
||||||
"python.org": 1.0,
|
|
||||||
"github.com": 0.9,
|
|
||||||
"medium.com": 0.7
|
|
||||||
})
|
|
||||||
|
|
||||||
Score based on domain importance
|
|
||||||
Configurable domain weights
|
|
||||||
Default weight for unknown domains"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
domain_weights: Dict[str, float],
|
|
||||||
default_weight: float = 0.5,
|
|
||||||
weight: float = 1.0,
|
|
||||||
):
|
|
||||||
super().__init__(weight=weight)
|
|
||||||
self.domain_weights = domain_weights
|
|
||||||
self.default_weight = default_weight
|
|
||||||
|
|
||||||
def _calculate_score(self, url: str) -> float:
|
|
||||||
"""Calculate score based on domain authority"""
|
|
||||||
domain = urlparse(url).netloc.lower()
|
|
||||||
return self.domain_weights.get(domain, self.default_weight)
|
|
||||||
|
|
||||||
class FastDomainAuthorityScorer(FastURLScorer):
|
|
||||||
__slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains')
|
__slots__ = ('_weight', '_domain_weights', '_default_weight', '_top_domains')
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -785,418 +517,3 @@ class FastDomainAuthorityScorer(FastURLScorer):
|
|||||||
|
|
||||||
# Regular path: check all domains
|
# Regular path: check all domains
|
||||||
return self._domain_weights.get(domain, self._default_weight)
|
return self._domain_weights.get(domain, self._default_weight)
|
||||||
|
|
||||||
def create_balanced_scorer() -> CompositeScorer:
|
|
||||||
"""Create a balanced composite scorer"""
|
|
||||||
return CompositeScorer(
|
|
||||||
[
|
|
||||||
KeywordRelevanceScorer(
|
|
||||||
keywords=["article", "blog", "news", "research"], weight=1.0
|
|
||||||
),
|
|
||||||
PathDepthScorer(optimal_depth=3, weight=0.7),
|
|
||||||
ContentTypeScorer(
|
|
||||||
type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
|
|
||||||
weight=0.8,
|
|
||||||
),
|
|
||||||
FreshnessScorer(weight=0.9),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
def create_balanced_fast_freshness_scorer() -> CompositeScorer:
|
|
||||||
"""Create a balanced composite scorer with fast freshness scorer"""
|
|
||||||
return FastCompositeScorer(
|
|
||||||
[
|
|
||||||
FastKeywordRelevanceScorer(
|
|
||||||
keywords=["article", "blog", "news", "research"], weight=1.0
|
|
||||||
),
|
|
||||||
FastPathDepthScorer(optimal_depth=3, weight=0.7),
|
|
||||||
FastContentTypeScorer(
|
|
||||||
type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
|
|
||||||
weight=0.8,
|
|
||||||
),
|
|
||||||
FastFreshnessScorer(weight=0.9),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Example Usage:
|
|
||||||
"""
|
|
||||||
# Create a composite scorer
|
|
||||||
scorer = CompositeScorer([
|
|
||||||
KeywordRelevanceScorer(["python", "programming"], weight=1.0),
|
|
||||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
|
||||||
FreshnessScorer(weight=0.8),
|
|
||||||
DomainAuthorityScorer(
|
|
||||||
domain_weights={
|
|
||||||
"python.org": 1.0,
|
|
||||||
"github.com": 0.9,
|
|
||||||
"medium.com": 0.7
|
|
||||||
},
|
|
||||||
weight=0.9
|
|
||||||
)
|
|
||||||
])
|
|
||||||
|
|
||||||
# Score a URL
|
|
||||||
score = scorer.score("https://python.org/article/2024/01/new-features")
|
|
||||||
|
|
||||||
# Access statistics
|
|
||||||
print(f"Average score: {scorer.stats.average_score}")
|
|
||||||
print(f"URLs scored: {scorer.stats.urls_scored}")
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def run_scorer_performance_test():
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from itertools import cycle
|
|
||||||
import sys
|
|
||||||
|
|
||||||
# Generate varied test URLs
|
|
||||||
base_urls = [
|
|
||||||
# News/blog articles with dates
|
|
||||||
"https://example.com/2024/01/article-123",
|
|
||||||
"https://news.com/2023-12-31/breaking-news",
|
|
||||||
"https://blog.site.com/2022_11_15/tech-update",
|
|
||||||
|
|
||||||
# Different content types
|
|
||||||
"https://docs.example.com/report.pdf",
|
|
||||||
"https://site.com/page.html?q=test",
|
|
||||||
"https://api.service.com/data.json",
|
|
||||||
|
|
||||||
# Various domain authorities
|
|
||||||
"https://python.org/downloads",
|
|
||||||
"https://github.com/repo/code",
|
|
||||||
"https://medium.com/@user/post",
|
|
||||||
|
|
||||||
# Different path depths
|
|
||||||
"https://site.com/category/subcategory/product/detail",
|
|
||||||
"https://shop.com/items",
|
|
||||||
"https://edu.org/courses/cs/intro/lecture1",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Create variations
|
|
||||||
test_urls = []
|
|
||||||
years = list(range(2020, 2025))
|
|
||||||
domains = ["example.com", "python.org", "github.com", "medium.com"]
|
|
||||||
extensions = ["html", "pdf", "php", "jsx"]
|
|
||||||
|
|
||||||
for base in base_urls:
|
|
||||||
test_urls.append(base)
|
|
||||||
# Add year variations
|
|
||||||
for year in years:
|
|
||||||
test_urls.append(f"https://blog.com/{year}/post-{random.randint(1,999)}")
|
|
||||||
# Add domain variations
|
|
||||||
for domain in domains:
|
|
||||||
test_urls.append(f"https://{domain}/article-{random.randint(1,999)}")
|
|
||||||
# Add extension variations
|
|
||||||
for ext in extensions:
|
|
||||||
test_urls.append(f"https://site.com/doc-{random.randint(1,999)}.{ext}")
|
|
||||||
|
|
||||||
# Multiply dataset
|
|
||||||
test_urls = test_urls * 5000 # Creates ~300k URLs
|
|
||||||
|
|
||||||
def benchmark(name: str, scorer, urls, warmup=True):
|
|
||||||
if warmup:
|
|
||||||
for url in urls[:100]: # Warmup with subset
|
|
||||||
scorer.score(url)
|
|
||||||
|
|
||||||
start = time.perf_counter_ns()
|
|
||||||
for url in urls:
|
|
||||||
scorer.score(url)
|
|
||||||
elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"{name:<35} {elapsed:>8.3f} ms ({len(urls)/elapsed*1000:,.0f} URLs/sec)"
|
|
||||||
)
|
|
||||||
return elapsed
|
|
||||||
|
|
||||||
print("\nBenchmarking original vs optimized scorers...")
|
|
||||||
print("-" * 75)
|
|
||||||
|
|
||||||
# Initialize test data
|
|
||||||
domain_weights = {"python.org": 1.0, "github.com": 0.9, "medium.com": 0.7}
|
|
||||||
type_weights = {".html$": 1.0, ".pdf$": 0.8, ".php$": 0.6}
|
|
||||||
keywords = ["python", "article", "blog", "docs"]
|
|
||||||
|
|
||||||
# Original implementations
|
|
||||||
keyword_scorer = KeywordRelevanceScorer(keywords=keywords, weight=1.0)
|
|
||||||
path_scorer = PathDepthScorer(optimal_depth=3, weight=0.7)
|
|
||||||
content_scorer = ContentTypeScorer(type_weights=type_weights, weight=0.8)
|
|
||||||
freshness_scorer = FreshnessScorer(weight=0.9)
|
|
||||||
domain_scorer = DomainAuthorityScorer(domain_weights=domain_weights, weight=1.0)
|
|
||||||
|
|
||||||
# Fast implementations
|
|
||||||
fast_keyword_scorer = FastKeywordRelevanceScorer(keywords=keywords, weight=1.0)
|
|
||||||
fast_path_scorer = FastPathDepthScorer(optimal_depth=3, weight=0.7)
|
|
||||||
fast_content_scorer = FastContentTypeScorer(type_weights=type_weights, weight=0.8)
|
|
||||||
fast_freshness_scorer = FastFreshnessScorer(weight=0.9)
|
|
||||||
fast_domain_scorer = FastDomainAuthorityScorer(domain_weights=domain_weights, weight=1.0)
|
|
||||||
|
|
||||||
# Test subset for individual scorers
|
|
||||||
test_subset = test_urls[:1000]
|
|
||||||
|
|
||||||
print("\nIndividual Scorer Performance (first 1000 URLs):")
|
|
||||||
|
|
||||||
print("\nKeyword Relevance Scorers:")
|
|
||||||
benchmark("Original Keyword Scorer", keyword_scorer, test_subset)
|
|
||||||
benchmark("Optimized Keyword Scorer", fast_keyword_scorer, test_subset)
|
|
||||||
|
|
||||||
print("\nPath Depth Scorers:")
|
|
||||||
benchmark("Original Path Scorer", path_scorer, test_subset)
|
|
||||||
benchmark("Optimized Path Scorer", fast_path_scorer, test_subset)
|
|
||||||
|
|
||||||
print("\nContent Type Scorers:")
|
|
||||||
benchmark("Original Content Scorer", content_scorer, test_subset)
|
|
||||||
benchmark("Optimized Content Scorer", fast_content_scorer, test_subset)
|
|
||||||
|
|
||||||
print("\nFreshness Scorers:")
|
|
||||||
benchmark("Original Freshness Scorer", freshness_scorer, test_subset)
|
|
||||||
benchmark("Optimized Freshness Scorer", fast_freshness_scorer, test_subset)
|
|
||||||
|
|
||||||
print("\nDomain Authority Scorers:")
|
|
||||||
benchmark("Original Domain Scorer", domain_scorer, test_subset)
|
|
||||||
benchmark("Optimized Domain Scorer", fast_domain_scorer, test_subset)
|
|
||||||
|
|
||||||
# Test composite scorers
|
|
||||||
print("\nComposite Scorer Performance (all URLs):")
|
|
||||||
|
|
||||||
original_composite = CompositeScorer([
|
|
||||||
keyword_scorer, path_scorer, content_scorer,
|
|
||||||
freshness_scorer, domain_scorer
|
|
||||||
])
|
|
||||||
|
|
||||||
fast_composite = FastCompositeScorer([
|
|
||||||
fast_keyword_scorer, fast_path_scorer, fast_content_scorer,
|
|
||||||
fast_freshness_scorer, fast_domain_scorer
|
|
||||||
])
|
|
||||||
|
|
||||||
benchmark("Original Composite Scorer", original_composite, test_urls)
|
|
||||||
benchmark("Optimized Composite Scorer", fast_composite, test_urls)
|
|
||||||
|
|
||||||
# Memory usage
|
|
||||||
print("\nMemory Usage per Scorer:")
|
|
||||||
print(f"Original Keyword Scorer: {sys.getsizeof(keyword_scorer):,} bytes")
|
|
||||||
print(f"Optimized Keyword Scorer: {sys.getsizeof(fast_keyword_scorer):,} bytes")
|
|
||||||
print(f"Original Path Scorer: {sys.getsizeof(path_scorer):,} bytes")
|
|
||||||
print(f"Optimized Path Scorer: {sys.getsizeof(fast_path_scorer):,} bytes")
|
|
||||||
print(f"Original Content Scorer: {sys.getsizeof(content_scorer):,} bytes")
|
|
||||||
print(f"Optimized Content Scorer: {sys.getsizeof(fast_content_scorer):,} bytes")
|
|
||||||
print(f"Original Freshness Scorer: {sys.getsizeof(freshness_scorer):,} bytes")
|
|
||||||
print(f"Optimized Freshness Scorer: {sys.getsizeof(fast_freshness_scorer):,} bytes")
|
|
||||||
print(f"Original Domain Scorer: {sys.getsizeof(domain_scorer):,} bytes")
|
|
||||||
print(f"Optimized Domain Scorer: {sys.getsizeof(fast_domain_scorer):,} bytes")
|
|
||||||
print(f"Original Composite: {sys.getsizeof(original_composite):,} bytes")
|
|
||||||
print(f"Optimized Composite: {sys.getsizeof(fast_composite):,} bytes")
|
|
||||||
|
|
||||||
def test_scorers():
|
|
||||||
import time
|
|
||||||
from itertools import chain
|
|
||||||
|
|
||||||
test_cases = [
|
|
||||||
# Keyword Scorer Tests
|
|
||||||
{
|
|
||||||
"scorer_type": "keyword",
|
|
||||||
"config": {
|
|
||||||
"keywords": ["python", "blog"],
|
|
||||||
"weight": 1.0,
|
|
||||||
"case_sensitive": False
|
|
||||||
},
|
|
||||||
"urls": {
|
|
||||||
"https://example.com/python-blog": 1.0,
|
|
||||||
"https://example.com/PYTHON-BLOG": 1.0,
|
|
||||||
"https://example.com/python-only": 0.5,
|
|
||||||
"https://example.com/other": 0.0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
# Path Depth Scorer Tests
|
|
||||||
{
|
|
||||||
"scorer_type": "path_depth",
|
|
||||||
"config": {
|
|
||||||
"optimal_depth": 2,
|
|
||||||
"weight": 1.0
|
|
||||||
},
|
|
||||||
"urls": {
|
|
||||||
"https://example.com/a/b": 1.0,
|
|
||||||
"https://example.com/a": 0.5,
|
|
||||||
"https://example.com/a/b/c": 0.5,
|
|
||||||
"https://example.com": 0.33333333
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
# Content Type Scorer Tests
|
|
||||||
{
|
|
||||||
"scorer_type": "content_type",
|
|
||||||
"config": {
|
|
||||||
"type_weights": {
|
|
||||||
".html$": 1.0,
|
|
||||||
".pdf$": 0.8,
|
|
||||||
".jpg$": 0.6
|
|
||||||
},
|
|
||||||
"weight": 1.0
|
|
||||||
},
|
|
||||||
"urls": {
|
|
||||||
"https://example.com/doc.html": 1.0,
|
|
||||||
"https://example.com/doc.pdf": 0.8,
|
|
||||||
"https://example.com/img.jpg": 0.6,
|
|
||||||
"https://example.com/other.txt": 0.0
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
# Freshness Scorer Tests
|
|
||||||
{
|
|
||||||
"scorer_type": "freshness",
|
|
||||||
"config": {
|
|
||||||
"weight": 1.0, # Remove current_year since original doesn't support it
|
|
||||||
},
|
|
||||||
"urls": {
|
|
||||||
"https://example.com/2024/01/post": 1.0,
|
|
||||||
"https://example.com/2023/12/post": 0.9,
|
|
||||||
"https://example.com/2022/post": 0.8,
|
|
||||||
"https://example.com/no-date": 0.5
|
|
||||||
}
|
|
||||||
},
|
|
||||||
|
|
||||||
# Domain Authority Scorer Tests
|
|
||||||
{
|
|
||||||
"scorer_type": "domain",
|
|
||||||
"config": {
|
|
||||||
"domain_weights": {
|
|
||||||
"python.org": 1.0,
|
|
||||||
"github.com": 0.8,
|
|
||||||
"medium.com": 0.6
|
|
||||||
},
|
|
||||||
"default_weight": 0.3,
|
|
||||||
"weight": 1.0
|
|
||||||
},
|
|
||||||
"urls": {
|
|
||||||
"https://python.org/about": 1.0,
|
|
||||||
"https://github.com/repo": 0.8,
|
|
||||||
"https://medium.com/post": 0.6,
|
|
||||||
"https://unknown.com": 0.3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
def create_scorer(scorer_type, config):
|
|
||||||
if scorer_type == "keyword":
|
|
||||||
return (
|
|
||||||
KeywordRelevanceScorer(**config),
|
|
||||||
FastKeywordRelevanceScorer(**config)
|
|
||||||
)
|
|
||||||
elif scorer_type == "path_depth":
|
|
||||||
return (
|
|
||||||
PathDepthScorer(**config),
|
|
||||||
FastPathDepthScorer(**config)
|
|
||||||
)
|
|
||||||
elif scorer_type == "content_type":
|
|
||||||
return (
|
|
||||||
ContentTypeScorer(**config),
|
|
||||||
FastContentTypeScorer(**config)
|
|
||||||
)
|
|
||||||
elif scorer_type == "freshness":
|
|
||||||
return (
|
|
||||||
FreshnessScorer(**config),
|
|
||||||
FastFreshnessScorer(**config, current_year=2024)
|
|
||||||
)
|
|
||||||
elif scorer_type == "domain":
|
|
||||||
return (
|
|
||||||
DomainAuthorityScorer(**config),
|
|
||||||
FastDomainAuthorityScorer(**config)
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_accuracy_test():
|
|
||||||
print("\nAccuracy Tests:")
|
|
||||||
print("-" * 50)
|
|
||||||
|
|
||||||
all_passed = True
|
|
||||||
for test_case in test_cases:
|
|
||||||
print(f"\nTesting {test_case['scorer_type']} scorer:")
|
|
||||||
original, fast = create_scorer(
|
|
||||||
test_case['scorer_type'],
|
|
||||||
test_case['config']
|
|
||||||
)
|
|
||||||
|
|
||||||
for url, expected in test_case['urls'].items():
|
|
||||||
orig_score = round(original.score(url), 8)
|
|
||||||
fast_score = round(fast.score(url), 8)
|
|
||||||
expected = round(expected, 8)
|
|
||||||
|
|
||||||
if abs(orig_score - expected) > 0.00001:
|
|
||||||
print(f"❌ Original Failed: URL '{url}'")
|
|
||||||
print(f" Expected: {expected}, Got: {orig_score}")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f"✅ Original Passed: URL '{url}'")
|
|
||||||
|
|
||||||
if abs(fast_score - expected) > 0.00001:
|
|
||||||
print(f"❌ Fast Failed: URL '{url}'")
|
|
||||||
print(f" Expected: {expected}, Got: {fast_score}")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f"✅ Fast Passed: URL '{url}'")
|
|
||||||
|
|
||||||
return all_passed
|
|
||||||
|
|
||||||
def run_composite_test():
|
|
||||||
print("\nTesting Composite Scorer:")
|
|
||||||
print("-" * 50)
|
|
||||||
|
|
||||||
# Create test data
|
|
||||||
test_urls = {
|
|
||||||
"https://python.org/blog/2024/01/new-release.html":0.86666667,
|
|
||||||
"https://github.com/repo/old-code.pdf": 0.62,
|
|
||||||
"https://unknown.com/random": 0.26
|
|
||||||
}
|
|
||||||
|
|
||||||
# Create composite scorers with all types
|
|
||||||
original_scorers = []
|
|
||||||
fast_scorers = []
|
|
||||||
|
|
||||||
for test_case in test_cases:
|
|
||||||
orig, fast = create_scorer(
|
|
||||||
test_case['scorer_type'],
|
|
||||||
test_case['config']
|
|
||||||
)
|
|
||||||
original_scorers.append(orig)
|
|
||||||
fast_scorers.append(fast)
|
|
||||||
|
|
||||||
original_composite = CompositeScorer(original_scorers, normalize=True)
|
|
||||||
fast_composite = FastCompositeScorer(fast_scorers, normalize=True)
|
|
||||||
|
|
||||||
all_passed = True
|
|
||||||
for url, expected in test_urls.items():
|
|
||||||
orig_score = round(original_composite.score(url), 8)
|
|
||||||
fast_score = round(fast_composite.score(url), 8)
|
|
||||||
|
|
||||||
if abs(orig_score - expected) > 0.00001:
|
|
||||||
print(f"❌ Original Composite Failed: URL '{url}'")
|
|
||||||
print(f" Expected: {expected}, Got: {orig_score}")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f"✅ Original Composite Passed: URL '{url}'")
|
|
||||||
|
|
||||||
if abs(fast_score - expected) > 0.00001:
|
|
||||||
print(f"❌ Fast Composite Failed: URL '{url}'")
|
|
||||||
print(f" Expected: {expected}, Got: {fast_score}")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f"✅ Fast Composite Passed: URL '{url}'")
|
|
||||||
|
|
||||||
return all_passed
|
|
||||||
|
|
||||||
# Run tests
|
|
||||||
print("Running Scorer Tests...")
|
|
||||||
accuracy_passed = run_accuracy_test()
|
|
||||||
composite_passed = run_composite_test()
|
|
||||||
|
|
||||||
if accuracy_passed and composite_passed:
|
|
||||||
print("\n✨ All tests passed!")
|
|
||||||
# Note: Already have performance tests in run_scorer_performance_test()
|
|
||||||
else:
|
|
||||||
print("\n❌ Some tests failed!")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_scorer_performance_test()
|
|
||||||
# test_scorers()
|
|
||||||
@@ -510,6 +510,7 @@ class HTML2Text(html.parser.HTMLParser):
|
|||||||
|
|
||||||
if tag == "a" and not self.ignore_links:
|
if tag == "a" and not self.ignore_links:
|
||||||
if start:
|
if start:
|
||||||
|
self.inside_link = True
|
||||||
if (
|
if (
|
||||||
"href" in attrs
|
"href" in attrs
|
||||||
and attrs["href"] is not None
|
and attrs["href"] is not None
|
||||||
@@ -526,6 +527,7 @@ class HTML2Text(html.parser.HTMLParser):
|
|||||||
else:
|
else:
|
||||||
self.astack.append(None)
|
self.astack.append(None)
|
||||||
else:
|
else:
|
||||||
|
self.inside_link = False
|
||||||
if self.astack:
|
if self.astack:
|
||||||
a = self.astack.pop()
|
a = self.astack.pop()
|
||||||
if self.maybe_automatic_link and not self.empty_link:
|
if self.maybe_automatic_link and not self.empty_link:
|
||||||
@@ -610,13 +612,22 @@ class HTML2Text(html.parser.HTMLParser):
|
|||||||
self.o("[" + str(a_props.count) + "]")
|
self.o("[" + str(a_props.count) + "]")
|
||||||
|
|
||||||
if tag == "dl" and start:
|
if tag == "dl" and start:
|
||||||
self.p()
|
self.p() # Add paragraph break before list starts
|
||||||
if tag == "dt" and not start:
|
self.p_p = 0 # Reset paragraph state
|
||||||
self.pbr()
|
|
||||||
if tag == "dd" and start:
|
elif tag == "dt" and start:
|
||||||
self.o(" ")
|
if self.p_p == 0: # If not first term
|
||||||
if tag == "dd" and not start:
|
self.o("\n\n") # Add spacing before new term-definition pair
|
||||||
self.pbr()
|
self.p_p = 0 # Reset paragraph state
|
||||||
|
|
||||||
|
elif tag == "dt" and not start:
|
||||||
|
self.o("\n") # Single newline between term and definition
|
||||||
|
|
||||||
|
elif tag == "dd" and start:
|
||||||
|
self.o(" ") # Indent definition
|
||||||
|
|
||||||
|
elif tag == "dd" and not start:
|
||||||
|
self.p_p = 0
|
||||||
|
|
||||||
if tag in ["ol", "ul"]:
|
if tag in ["ol", "ul"]:
|
||||||
# Google Docs create sub lists as top level lists
|
# Google Docs create sub lists as top level lists
|
||||||
@@ -1026,6 +1037,7 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
self.inside_pre = False
|
self.inside_pre = False
|
||||||
self.inside_code = False
|
self.inside_code = False
|
||||||
|
self.inside_link = False
|
||||||
self.preserve_tags = set() # Set of tags to preserve
|
self.preserve_tags = set() # Set of tags to preserve
|
||||||
self.current_preserved_tag = None
|
self.current_preserved_tag = None
|
||||||
self.preserved_content = []
|
self.preserved_content = []
|
||||||
@@ -1105,11 +1117,17 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
# Ignore code tags inside pre blocks if handle_code_in_pre is False
|
# Ignore code tags inside pre blocks if handle_code_in_pre is False
|
||||||
return
|
return
|
||||||
if start:
|
if start:
|
||||||
self.o("`") # Markdown inline code start
|
if not self.inside_link:
|
||||||
|
self.o("`") # Only output backtick if not inside a link
|
||||||
self.inside_code = True
|
self.inside_code = True
|
||||||
else:
|
else:
|
||||||
self.o("`") # Markdown inline code end
|
if not self.inside_link:
|
||||||
|
self.o("`") # Only output backtick if not inside a link
|
||||||
self.inside_code = False
|
self.inside_code = False
|
||||||
|
|
||||||
|
# If inside a link, let the parent class handle the content
|
||||||
|
if self.inside_link:
|
||||||
|
super().handle_tag(tag, attrs, start)
|
||||||
else:
|
else:
|
||||||
super().handle_tag(tag, attrs, start)
|
super().handle_tag(tag, attrs, start)
|
||||||
|
|
||||||
|
|||||||
@@ -179,7 +179,7 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
|||||||
"ignore_emphasis": False,
|
"ignore_emphasis": False,
|
||||||
"ignore_links": False,
|
"ignore_links": False,
|
||||||
"ignore_images": False,
|
"ignore_images": False,
|
||||||
"protect_links": True,
|
"protect_links": False,
|
||||||
"single_line_break": True,
|
"single_line_break": True,
|
||||||
"mark_code": True,
|
"mark_code": True,
|
||||||
"escape_snob": False,
|
"escape_snob": False,
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ Avoid Common Mistakes:
|
|||||||
- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
|
- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
|
||||||
- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
|
- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
|
||||||
- Do not miss closing </blocks> tag at the end of the JSON output.
|
- Do not miss closing </blocks> tag at the end of the JSON output.
|
||||||
- Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.
|
- Do not generate the Python code show me how to do the task, this is your task to extract the information and return it in JSON format.
|
||||||
|
|
||||||
Result
|
Result
|
||||||
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
|
||||||
|
|||||||
@@ -168,10 +168,10 @@ async def main():
|
|||||||
"name": "News Items",
|
"name": "News Items",
|
||||||
"baseSelector": "tr.athing",
|
"baseSelector": "tr.athing",
|
||||||
"fields": [
|
"fields": [
|
||||||
{"name": "title", "selector": "a.storylink", "type": "text"},
|
{"name": "title", "selector": "span.titleline a", "type": "text"},
|
||||||
{
|
{
|
||||||
"name": "link",
|
"name": "link",
|
||||||
"selector": "a.storylink",
|
"selector": "span.titleline a",
|
||||||
"type": "attribute",
|
"type": "attribute",
|
||||||
"attribute": "href"
|
"attribute": "href"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -135,14 +135,14 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
|
|||||||
# Using OpenAI (requires API token)
|
# Using OpenAI (requires API token)
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
llm_provider="openai/gpt-4o", # Default provider
|
provider="openai/gpt-4o", # Default provider
|
||||||
api_token="your-openai-token" # Required for OpenAI
|
api_token="your-openai-token" # Required for OpenAI
|
||||||
)
|
)
|
||||||
|
|
||||||
# Or using Ollama (open source, no token needed)
|
# Or using Ollama (open source, no token needed)
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
llm_provider="ollama/llama3.3", # Open source alternative
|
provider="ollama/llama3.3", # Open source alternative
|
||||||
api_token=None # Not needed for Ollama
|
api_token=None # Not needed for Ollama
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -434,7 +434,7 @@ html = """
|
|||||||
css_schema = JsonCssExtractionStrategy.generate_schema(
|
css_schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
schema_type="css", # This is the default
|
schema_type="css", # This is the default
|
||||||
llm_provider="openai/gpt-4o", # Default provider
|
provider="openai/gpt-4o", # Default provider
|
||||||
api_token="your-openai-token" # Required for OpenAI
|
api_token="your-openai-token" # Required for OpenAI
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -442,7 +442,7 @@ css_schema = JsonCssExtractionStrategy.generate_schema(
|
|||||||
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
||||||
html,
|
html,
|
||||||
schema_type="xpath",
|
schema_type="xpath",
|
||||||
llm_provider="ollama/llama3.3", # Open source alternative
|
provider="ollama/llama3.3", # Open source alternative
|
||||||
api_token=None # Not needed for Ollama
|
api_token=None # Not needed for Ollama
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
46
tests/20241401/test_advanced_deep_crawl.py
Normal file
46
tests/20241401/test_advanced_deep_crawl.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||||
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||||
|
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||||
|
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
|
||||||
|
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||||
|
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run a streaming best-first deep crawl starting at techcrunch.com.

    Builds a filter chain (URL pattern, domain, content relevance and
    content type filters), plugs it into a ``BestFirstCrawlingStrategy``
    scored by ``KeywordRelevanceScorer``, then prints every streamed
    result's URL and depth followed by the total wall-clock duration.
    """
    url_filters = FilterChain([
        URLPatternFilter(patterns=["*2025*"]),
        DomainFilter(allowed_domains=["techcrunch.com"]),
        ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
        ContentTypeFilter(allowed_types=["text/html","application/javascript"])
    ])
    crawl_strategy = BestFirstCrawlingStrategy(
        max_depth=2,
        include_external=False,
        filter_chain=url_filters,
        url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=crawl_strategy,
        stream=False,
        verbose=True,
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    async with AsyncWebCrawler() as crawler:
        print("Starting deep crawl in streaming mode:")
        # The config is created non-streaming and flipped to streaming
        # right before the run, so arun() is iterated with `async for`.
        config.stream = True
        started_at = time.perf_counter()
        result_stream = await crawler.arun(
            url="https://techcrunch.com",
            config=config,
        )
        async for result in result_stream:
            depth = result.metadata.get('depth', 0)
            print(f"→ {result.url} (Depth: {depth})")
        elapsed = time.perf_counter() - started_at
        print(f"Duration: {elapsed:.2f} seconds")


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
279
tests/20241401/test_deep_crawl_filters.py
Normal file
279
tests/20241401/test_deep_crawl_filters.py
Normal file
@@ -0,0 +1,279 @@
|
|||||||
|
from crawl4ai.deep_crawling.filters import ContentRelevanceFilter, URLPatternFilter, DomainFilter, ContentTypeFilter, SEOFilter
|
||||||
|
async def test_pattern_filter():
    """Check ``URLPatternFilter.apply`` against a table of patterns and URLs.

    Each entry pairs a pattern (or list of patterns) with a mapping of
    URL -> expected boolean result. A pass/fail line is printed per URL
    and a summary line at the end; mismatches are reported, not raised.
    """
    # (pattern(s), {url: expected result}) pairs; a list is used instead
    # of a dict so that an entry can carry several patterns at once.
    pattern_cases = [
        # Simple suffix patterns (*.html)
        ("*.html", {
            "https://example.com/page.html": True,
            "https://example.com/path/doc.html": True,
            "https://example.com/page.htm": False,
            "https://example.com/page.html?param=1": True,
        }),

        # Path segment patterns (*/article/*)
        ("*/article/*", {
            "https://example.com/article/123": True,
            "https://example.com/blog/article/456": True,
            "https://example.com/articles/789": False,
            "https://example.com/article": False,
        }),

        # Complex patterns
        ("blog-*-[0-9]", {
            "https://example.com/blog-post-1": True,
            "https://example.com/blog-test-9": True,
            "https://example.com/blog-post": False,
            "https://example.com/blog-post-x": False,
        }),

        # Multiple patterns case
        (["*.pdf", "*/download/*"], {
            "https://example.com/doc.pdf": True,
            "https://example.com/download/file.txt": True,
            "https://example.com/path/download/doc": True,
            "https://example.com/uploads/file.txt": False,
        }),

        # Edge cases
        ("*", {
            "https://example.com": True,
            "": True,
            "http://test.com/path": True,
        }),

        # Complex regex
        (r"^https?://.*\.example\.com/\d+", {
            "https://sub.example.com/123": True,
            "http://test.example.com/456": True,
            "https://example.com/789": False,
            "https://sub.example.com/abc": False,
        })
    ]

    print("Running Pattern Filter Tests...")
    print("\nAccuracy Tests:")
    print("-" * 50)

    every_case_ok = True
    for patterns, url_expectations in pattern_cases:
        url_filter = URLPatternFilter(patterns)

        for url, expected in url_expectations.items():
            actual = url_filter.apply(url)
            if actual != expected:
                print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
                print(f" Expected: {expected}, Got: {actual}")
                every_case_ok = False
                continue
            print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")

    summary = (
        "\n✨ All accuracy tests passed!"
        if every_case_ok
        else "\n❌ Some accuracy tests failed!"
    )
    print(summary)
|
||||||
|
|
||||||
|
async def test_domain_filter():
    """Check ``DomainFilter.apply`` for allowed/blocked domain settings.

    Each case pairs constructor parameters (``allowed`` and/or ``blocked``,
    given as a single domain or a list) with a mapping of URL -> expected
    boolean result. Results are printed per URL with a final summary;
    mismatches are reported, not raised.
    """
    # Test cases
    test_cases = [
        # Allowed domains
        ({"allowed": "example.com"}, {
            "https://example.com/page": True,
            "http://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),

        ({"allowed": ["example.com", "test.com"]}, {
            "https://example.com/page": True,
            "https://test.com/home": True,
            "https://other.com": False,
        }),

        # Blocked domains
        ({"blocked": "malicious.com"}, {
            "https://malicious.com": False,
            "https://safe.com": True,
            "http://malicious.com/login": False,
        }),

        ({"blocked": ["spam.com", "ads.com"]}, {
            "https://spam.com": False,
            "https://ads.com/banner": False,
            "https://example.com": True,
        }),

        # Allowed and Blocked combination
        ({"allowed": "example.com", "blocked": "sub.example.com"}, {
            "https://example.com": True,
            "https://sub.example.com": False,
            "https://other.com": False,
        }),
    ]

    def run_accuracy_test():
        """Apply every case and return True only if all URLs matched."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for params, test_urls in test_cases:
            # A missing key yields None, i.e. that restriction is not passed.
            filter_obj = DomainFilter(
                allowed_domains=params.get("allowed"),
                blocked_domains=params.get("blocked"),
            )

            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f" Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")

        return all_passed

    # Run tests
    print("Running Domain Filter Tests...")
    accuracy_passed = run_accuracy_test()

    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
|
||||||
|
|
||||||
|
async def test_content_relevance_filter():
    """Check ``ContentRelevanceFilter.apply`` against two Wikipedia URLs.

    The filter is built with a query about the American Civil War and a
    threshold of 1; ``apply`` is awaited for each URL and compared with
    the expected boolean. Results are printed per URL with a final
    summary; mismatches are reported, not raised.
    """
    relevance_filter = ContentRelevanceFilter(
        query="What was the cause of american civil war?",
        threshold=1,
    )

    # URL -> expected filter verdict
    expectations = {
        "https://en.wikipedia.org/wiki/Cricket": False,
        "https://en.wikipedia.org/wiki/American_Civil_War": True,
    }

    print("\nRunning Content Relevance Filter Tests...")
    print("-" * 50)

    every_case_ok = True
    for url, expected in expectations.items():
        verdict = await relevance_filter.apply(url)
        if verdict != expected:
            print(f"\u274C Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {verdict}")
            every_case_ok = False
            continue
        print(f"\u2705 Passed: URL '{url}'")

    if not every_case_ok:
        print("\n\u274C Some content relevance tests failed!")
    else:
        print("\n\u2728 All content relevance tests passed!")
|
||||||
|
|
||||||
|
async def test_content_type_filter():
    """Check ``ContentTypeFilter.apply`` for several allowed MIME types.

    Each case pairs an ``allowed`` value (a single type or a list of
    types) with a mapping of URL -> expected boolean result. Results are
    printed per URL with a final summary; mismatches are reported, not
    raised.
    """
    # Test cases
    test_cases = [
        # Allowed single type
        ({"allowed": "image/png"}, {
            "https://example.com/image.png": True,
            "https://example.com/photo.jpg": False,
            "https://example.com/document.pdf": False,
        }),

        # Multiple allowed types
        ({"allowed": ["image/jpeg", "application/pdf"]}, {
            "https://example.com/photo.jpg": True,
            "https://example.com/document.pdf": True,
            "https://example.com/script.js": False,
        }),

        # A URL without an extension is expected to pass
        ({"allowed": "application/json"}, {
            "https://example.com/api/data": True,
            "https://example.com/data.json": True,
            "https://example.com/page.html": False,
        }),

        # An unrecognised extension is expected to pass, while known
        # extensions of a different type (.zip, .exe) are rejected
        ({"allowed": "application/octet-stream"}, {
            "https://example.com/file.unknown": True,
            "https://example.com/archive.zip": False,
            "https://example.com/software.exe": False,
        }),
    ]

    def run_accuracy_test():
        """Apply every case and return True only if all URLs matched."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for params, test_urls in test_cases:
            filter_obj = ContentTypeFilter(
                allowed_types=params.get("allowed"),
            )

            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"\u274C Failed: Params {params} with URL '{url}'")
                    print(f" Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"\u2705 Passed: Params {params} with URL '{url}'")

        return all_passed

    # Run tests
    print("Running Content Type Filter Tests...")
    accuracy_passed = run_accuracy_test()

    if accuracy_passed:
        print("\n\u2728 All accuracy tests passed!")
    else:
        print("\n\u274C Some accuracy tests failed!")
|
||||||
|
|
||||||
|
async def test_seo_filter():
    """Check ``SEOFilter.apply`` against two Wikipedia URLs.

    The filter is built with a 0.5 threshold and three SEO-related
    keywords; ``apply`` is awaited for each URL and compared with the
    expected boolean. Results are printed per URL with a final summary;
    mismatches are reported, not raised.
    """
    seo_filter = SEOFilter(threshold=0.5, keywords=["SEO", "search engines", "Optimization"])

    # URL -> expected filter verdict
    expectations = {
        "https://en.wikipedia.org/wiki/Search_engine_optimization": True,
        "https://en.wikipedia.org/wiki/Randomness": False,
    }

    print("\nRunning SEO Filter Tests...")
    print("-" * 50)

    every_case_ok = True
    for url, expected in expectations.items():
        verdict = await seo_filter.apply(url)
        if verdict != expected:
            print(f"\u274C Failed: URL '{url}'")
            print(f" Expected: {expected}, Got: {verdict}")
            every_case_ok = False
            continue
        print(f"\u2705 Passed: URL '{url}'")

    if not every_case_ok:
        print("\n\u274C Some SEO filter tests failed!")
    else:
        print("\n\u2728 All SEO filter tests passed!")
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the suites one after another, each through its own
    # asyncio.run() call, in the same order as before.
    for suite in (
        test_pattern_filter,
        test_domain_filter,
        test_content_type_filter,
        test_content_relevance_filter,
        test_seo_filter,
    ):
        asyncio.run(suite())
|
||||||
179
tests/20241401/test_deep_crawl_scorers.py
Normal file
179
tests/20241401/test_deep_crawl_scorers.py
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
from crawl4ai.deep_crawling.scorers import CompositeScorer, ContentTypeScorer, DomainAuthorityScorer, FreshnessScorer, KeywordRelevanceScorer, PathDepthScorer
|
||||||
|
|
||||||
|
|
||||||
|
def test_scorers():
    """Check the individual URL scorers and their ``CompositeScorer`` blend.

    Every entry of ``test_cases`` names a scorer type, the keyword
    arguments used to build it, and a mapping of URL -> expected score.
    The same configurations are then combined into one normalized
    ``CompositeScorer`` and checked against a second set of expected
    scores. Results are printed per URL with a final summary; mismatches
    are reported, not raised.
    """
    test_cases = [
        # Keyword Scorer Tests
        {
            "scorer_type": "keyword",
            "config": {
                "keywords": ["python", "blog"],
                "weight": 1.0,
                "case_sensitive": False
            },
            "urls": {
                "https://example.com/python-blog": 1.0,
                "https://example.com/PYTHON-BLOG": 1.0,
                "https://example.com/python-only": 0.5,
                "https://example.com/other": 0.0
            }
        },

        # Path Depth Scorer Tests
        {
            "scorer_type": "path_depth",
            "config": {
                "optimal_depth": 2,
                "weight": 1.0
            },
            "urls": {
                "https://example.com/a/b": 1.0,
                "https://example.com/a": 0.5,
                "https://example.com/a/b/c": 0.5,
                "https://example.com": 0.33333333
            }
        },

        # Content Type Scorer Tests
        {
            "scorer_type": "content_type",
            "config": {
                "type_weights": {
                    ".html$": 1.0,
                    ".pdf$": 0.8,
                    ".jpg$": 0.6
                },
                "weight": 1.0
            },
            "urls": {
                "https://example.com/doc.html": 1.0,
                "https://example.com/doc.pdf": 0.8,
                "https://example.com/img.jpg": 0.6,
                "https://example.com/other.txt": 0.0
            }
        },

        # Freshness Scorer Tests
        {
            "scorer_type": "freshness",
            "config": {
                # current_year is not listed here: create_scorer() pins it
                # to 2024 so the expected scores below do not drift.
                "weight": 1.0,
            },
            "urls": {
                "https://example.com/2024/01/post": 1.0,
                "https://example.com/2023/12/post": 0.9,
                "https://example.com/2022/post": 0.8,
                "https://example.com/no-date": 0.5
            }
        },

        # Domain Authority Scorer Tests
        {
            "scorer_type": "domain",
            "config": {
                "domain_weights": {
                    "python.org": 1.0,
                    "github.com": 0.8,
                    "medium.com": 0.6
                },
                "default_weight": 0.3,
                "weight": 1.0
            },
            "urls": {
                "https://python.org/about": 1.0,
                "https://github.com/repo": 0.8,
                "https://medium.com/post": 0.6,
                "https://unknown.com": 0.3
            }
        }
    ]

    def create_scorer(scorer_type, config):
        """Build the scorer named by ``scorer_type`` from ``config`` kwargs.

        Raises:
            ValueError: if ``scorer_type`` is not one of the known names.
        """
        if scorer_type == "keyword":
            return KeywordRelevanceScorer(**config)
        elif scorer_type == "path_depth":
            return PathDepthScorer(**config)
        elif scorer_type == "content_type":
            return ContentTypeScorer(**config)
        elif scorer_type == "freshness":
            # Fixed reference year keeps the date-based expectations stable.
            return FreshnessScorer(**config, current_year=2024)
        elif scorer_type == "domain":
            return DomainAuthorityScorer(**config)
        # Fail loudly instead of returning None, which would only surface
        # later as an AttributeError on `.score`.
        raise ValueError(f"Unknown scorer type: {scorer_type!r}")

    def run_accuracy_test():
        """Score every URL with its own scorer; return True if all match."""
        print("\nAccuracy Tests:")
        print("-" * 50)

        all_passed = True
        for test_case in test_cases:
            print(f"\nTesting {test_case['scorer_type']} scorer:")
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )

            for url, expected in test_case['urls'].items():
                score = round(scorer.score(url), 8)
                expected = round(expected, 8)

                # Compare with a small tolerance rather than exact equality.
                if abs(score - expected) > 0.00001:
                    print(f"❌ Scorer Failed: URL '{url}'")
                    print(f" Expected: {expected}, Got: {score}")
                    all_passed = False
                else:
                    print(f"✅ Scorer Passed: URL '{url}'")

        return all_passed

    def run_composite_test():
        """Score URLs with all scorers combined; return True if all match."""
        print("\nTesting Composite Scorer:")
        print("-" * 50)

        # Create test data
        test_urls = {
            "https://python.org/blog/2024/01/new-release.html":0.86666667,
            "https://github.com/repo/old-code.pdf": 0.62,
            "https://unknown.com/random": 0.26
        }

        # Create composite scorers with all types
        scorers = []

        for test_case in test_cases:
            scorer = create_scorer(
                test_case['scorer_type'],
                test_case['config']
            )
            scorers.append(scorer)

        composite = CompositeScorer(scorers, normalize=True)

        all_passed = True
        for url, expected in test_urls.items():
            score = round(composite.score(url), 8)

            if abs(score - expected) > 0.00001:
                print(f"❌ Composite Failed: URL '{url}'")
                print(f" Expected: {expected}, Got: {score}")
                all_passed = False
            else:
                print(f"✅ Composite Passed: URL '{url}'")

        return all_passed

    # Run tests
    print("Running Scorer Tests...")
    accuracy_passed = run_accuracy_test()
    composite_passed = run_composite_test()

    if accuracy_passed and composite_passed:
        print("\n✨ All tests passed!")
        # Note: Already have performance tests in run_scorer_performance_test()
    else:
        print("\n❌ Some tests failed!")


if __name__ == "__main__":
    test_scorers()
|
||||||
Reference in New Issue
Block a user