refactor(scraper): optimize URL validation and filter performance

- Replace validators library with built-in urlparse for URL validation
- Optimize filter statistics update logic for better performance
- Add performance benchmarking suite for filters
- Add execution time tracking to scraper examples
- Update gitignore with windsurfrules

BREAKING CHANGE: Removed dependency on validators library for URL validation
2025-01-22 19:45:56 +08:00
parent 6e78c56dda
commit e6ef8d91ba
6 changed files with 1140 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -227,4 +227,7 @@ tree.md
 .do
 /plans
 .codeiumignore
-todo/
+todo/
 # windsurf rules
 .windsurfrules
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -5,7 +5,7 @@ import asyncio
 import logging
 from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
-import validators
+# import validators
 from ..async_configs import CrawlerRunConfig
 from .models import CrawlResult
@@ -54,14 +54,22 @@ class BFSScraperStrategy(ScraperStrategy):
    async def can_process_url(self, url: str, depth: int) -> bool:
        """Check if URL can be processed based on robots.txt and filters
        This is our gatekeeper method that determines if a URL should be processed. It:
-            - Validates URL format using the validators library
+            - Validates URL format using a robust built-in method
            - Checks robots.txt permissions for the domain
            - Applies custom filters from the filter chain
            - Updates statistics for blocked URLs
            - Returns False early if any check fails
        """
-        if not validators.url(url):
+        try:
-            self.logger.warning(f"Invalid URL: {url}")
+            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError("Invalid URL")
            if result.scheme not in ('http', 'https'):
                raise ValueError("URL must be HTTP or HTTPS")
            if not result.netloc or '.' not in result.netloc:
                raise ValueError("Invalid domain")
        except Exception as e:
            self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
            return False
        robot_parser = await self._get_robot_parser(url)
@@ -70,7 +78,7 @@ class BFSScraperStrategy(ScraperStrategy):
            self.logger.info(f"Blocked by robots.txt: {url}")
            return False
-        # Apply the filter chain it's not start page
+        # Apply the filter chain if it's not start page
        if depth != 0 and not self.filter_chain.apply(url):
            return False
--- a/crawl4ai/scraper/filters.py
+++ b/crawl4ai/scraper/filters.py
@@ -36,11 +36,15 @@ class URLFilter(ABC):
    def _update_stats(self, passed: bool):
        """Update filter statistics"""
        # INFO: Old trick to make things faster
        self.stats.total_urls += 1
-        if passed:
+        self.stats.passed_urls += passed
-            self.stats.passed_urls += 1
+        self.stats.rejected_urls += not passed        
-        else:
+        # self.stats.total_urls += 1
-            self.stats.rejected_urls += 1
+        # if passed:
        #     self.stats.passed_urls += 1
        # else:
        #     self.stats.rejected_urls += 1
 class FilterChain:
@@ -233,3 +237,49 @@ def create_common_filter_chain() -> FilterChain:
            DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
        ]
    )
 def run_performance_test():
    """Print rough timings for each filter and for the full filter chain.

    Builds 800k sample URLs, times a single ``apply`` call of each individual
    filter on the first five URLs, then times the whole chain over every URL.
    Results are printed; nothing is returned.
    """
    import time
    # Test URLs
    test_urls = [
        'https://example.com/article/123',
        'https://blog.example.com/post/456',
        'https://ads.example.com/tracking',
        'https://example.com/about.html',
        'https://analytics.example.com/script.js',
        'https://example.com/products.php',
        'https://subdomain.example.com/blog/post-123',
        'https://example.com/path/file.pdf',
    ] * 100000  # Create 800k URLs to test
    def benchmark(name: str, func, *args):
        """Time one call of ``func(*args)``, print the elapsed ms and return its result."""
        start = time.perf_counter_ns()
        # Always forward the arguments as given: no sentinel is needed to call a
        # zero-argument callable, and falsy arguments (e.g. "") are passed through.
        result = func(*args)
        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
        print(f"{name:<30} {elapsed:>8.3f} ms")
        return result
    # Test individual filters
    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
    content_filter = ContentTypeFilter(["text/html"])
    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
    # Test chain
    chain = FilterChain([pattern_filter, content_filter, domain_filter])
    print("\nBenchmarking individual filters...")
    for url in test_urls[:5]:  # Show first 5 results
        print(f"\nTesting URL: {url}")
        benchmark("Pattern Filter", pattern_filter.apply, url)
        benchmark("Content Filter", content_filter.apply, url)
        benchmark("Domain Filter", domain_filter.apply, url)
    print("\nBenchmarking full chain...")
    benchmark("Full Chain", lambda: [chain.apply(url) for url in test_urls])
 if __name__ == "__main__":
    run_performance_test()
--- a/crawl4ai/scraper/filters_review.py
+++ b/crawl4ai/scraper/filters_review.py
@@ -0,0 +1,873 @@
 # from .url_filter import URLFilter, FilterChain
 # from .content_type_filter import ContentTypeFilter
 # from .url_pattern_filter import URLPatternFilter
 from abc import ABC, abstractmethod
 from typing import List, Pattern, Set, Union, FrozenSet
 import re, time
 from urllib.parse import urlparse
 from array import array
 import logging
 from functools import lru_cache
 import fnmatch
 from dataclasses import dataclass
 from typing import ClassVar
 import weakref
 import mimetypes
@dataclass
 class FilterStats:
    """Running URL counters kept by a filter or by a filter chain."""
    # PERF: a regular dataclass instance stores its fields in a per-instance __dict__
    # PERF: Could use __slots__ to reduce memory footprint
    # PERF: array.array('I') is an alternative compact layout; NOTE(review): nothing
    #       here makes increments atomic - confirm before relying on it across threads
    total_urls: int = 0  # every URL that was checked
    rejected_urls: int = 0  # URLs that failed the check
    passed_urls: int = 0  # URLs that passed the check
 class URLFilter(ABC):
    """Abstract base class for URL filters that keep pass/reject counters.

    Concrete filters implement :meth:`apply` and report each verdict through
    :meth:`_update_stats`.
    """
    def __init__(self, name: str = None):
        # Without an explicit name the concrete class name identifies the filter.
        self.name = name if name else type(self).__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")
    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return True when ``url`` passes this filter."""
    def _update_stats(self, passed: bool):
        """Record one verdict: bump the total and either the passed or rejected counter."""
        counters = self.stats
        counters.total_urls += 1
        # bool is an int subclass, so the flag itself is the increment.
        counters.passed_urls += passed
        counters.rejected_urls += not passed
 class FilterChain:
    """Runs a URL through a list of filters and accepts it only if all of them do."""
    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters if filters else []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")
    def apply(self, url: str) -> bool:
        """Return True when every filter accepts ``url``; stop at the first rejection.

        The chain's own counters are updated once per call, and the rejecting
        filter's name is logged at debug level.
        """
        counters = self.stats
        counters.total_urls += 1
        for filter_ in self.filters:
            if filter_.apply(url):
                continue
            # First rejection decides the outcome; later filters are not consulted.
            counters.rejected_urls += 1
            self.logger.debug(f"URL {url} rejected by {filter_.name}")
            return False
        counters.passed_urls += 1
        return True
 class URLPatternFilter(URLFilter):
    """Accepts a URL when at least one configured pattern is found in it.

    String patterns are translated from glob syntax when ``use_glob`` is true
    and compiled as regular expressions otherwise; already-compiled patterns
    are used as they are.
    """
    def __init__(
        self,
        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
        use_glob: bool = True,
    ):
        super().__init__()
        if isinstance(patterns, (str, Pattern)):
            patterns = [patterns]
        self.patterns = patterns
        self.use_glob = use_glob
        self._compiled_patterns = [
            self._compile(entry, use_glob) for entry in self.patterns
        ]
    def _compile(self, entry, as_glob: bool) -> Pattern:
        """Turn one configured pattern into a compiled regular expression."""
        if not isinstance(entry, str):
            return entry
        if as_glob:
            return self._glob_to_regex(entry)
        return re.compile(entry)
    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Compile a glob pattern via ``fnmatch.translate``."""
        translated = fnmatch.translate(pattern)
        return re.compile(translated)
    def apply(self, url: str) -> bool:
        """Return True (and record the verdict) when any pattern is found in ``url``."""
        matches = False
        for regex in self._compiled_patterns:
            if regex.search(url):
                matches = True
                break
        self._update_stats(matches)
        return matches
 class ContentTypeFilter(URLFilter):
    """Accepts URLs whose guessed MIME type contains one of the allowed types.

    URLs whose path carries no extension are accepted; with
    ``check_extension`` disabled every URL is accepted.
    """
    def __init__(
        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        super().__init__()
        if isinstance(allowed_types, str):
            allowed_types = [allowed_types]
        self.allowed_types = allowed_types
        self.check_extension = check_extension
        self._normalize_types()
    def _normalize_types(self):
        """Lower-case the configured content types."""
        self.allowed_types = [entry.lower() for entry in self.allowed_types]
    def _check_extension(self, url: str) -> bool:
        """Return True when the URL has no extension or its guessed type is allowed."""
        path = urlparse(url).path
        if "." not in path:
            return True
        ext = path.split(".")[-1].lower()
        if not ext:
            # Path ends with a dot: treated like "no extension".
            return True
        guessed_type = mimetypes.guess_type(url)[0]
        candidate = (guessed_type or "").lower()
        for allowed in self.allowed_types:
            if allowed in candidate:
                return True
        return False
    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed and record the verdict."""
        result = self._check_extension(url) if self.check_extension else True
        self._update_stats(result)
        return result
 class DomainFilter(URLFilter):
    """Accepts or rejects URLs by exact comparison of their network location.

    A URL is rejected when its lower-cased netloc is in the blocked set, or
    when an allow-list was given and the netloc is not in it.
    """
    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()
        # No allow-list (None) means "everything that is not blocked".
        self.allowed_domains = None
        if allowed_domains:
            self.allowed_domains = set(self._normalize_domains(allowed_domains))
        self.blocked_domains = set()
        if blocked_domains:
            self.blocked_domains = set(self._normalize_domains(blocked_domains))
    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Return the domains lower-cased and stripped; a single string becomes a one-item list."""
        entries = [domains] if isinstance(domains, str) else domains
        return [entry.lower().strip() for entry in entries]
    def _extract_domain(self, url: str) -> str:
        """Return the lower-cased netloc of ``url``."""
        netloc = urlparse(url).netloc
        return netloc.lower()
    def apply(self, url: str) -> bool:
        """Return True (and record the verdict) when the URL's domain is acceptable."""
        domain = self._extract_domain(url)
        allow_list = self.allowed_domains
        passed = domain not in self.blocked_domains and (
            allow_list is None or domain in allow_list
        )
        self._update_stats(passed)
        return passed
 # Example usage:
 def create_common_filter_chain() -> FilterChain:
    """Create a commonly used filter chain"""
    # HTML files and common content paths
    content_patterns = ["*.html", "*.htm", "*/article/*", "*/blog/*"]
    html_types = ["text/html", "application/xhtml+xml"]
    blocked = ["ads.*", "analytics.*"]
    filters = [
        URLPatternFilter(content_patterns),
        ContentTypeFilter(html_types),
        DomainFilter(blocked_domains=blocked),
    ]
    return FilterChain(filters)
 ####################################################################################
 # Uncledoe: Optimized Version
 ####################################################################################
 # Use __slots__ and array for maximum memory/speed efficiency
 class FastFilterStats:
    """Compact URL counters exposed through read-only properties.

    The three counters live in one ``array`` (total, passed, rejected) that
    the filters increment by index.
    """
    __slots__ = ("_counters",)
    def __init__(self):
        # 'Q' (unsigned 64-bit) instead of 'I': a 32-bit counter raises
        # OverflowError once it passes 2**32 - 1 on long-running crawls.
        self._counters = array("Q", [0, 0, 0])  # total, passed, rejected
    @property
    def total_urls(self):
        """Number of URLs checked."""
        return self._counters[0]
    @property
    def passed_urls(self):
        """Number of URLs that passed."""
        return self._counters[1]
    @property
    def rejected_urls(self):
        """Number of URLs that were rejected."""
        return self._counters[2]
 class FastURLFilter(ABC):
    """Slot-based abstract base for the optimized filters.

    Holds the filter name, its counters and a lazily created logger that is
    remembered through a weak reference.
    """
    __slots__ = ("name", "stats", "_logger_ref")
    def __init__(self, name: str = None):
        self.name = name if name else type(self).__name__
        self.stats = FastFilterStats()
        # The logger is only looked up on first access of ``logger``.
        self._logger_ref = None
    @property
    def logger(self):
        """Logger named ``urlfilter.<name>``, created on first use."""
        ref = self._logger_ref
        cached = ref() if ref is not None else None
        if cached is None:
            cached = logging.getLogger(f"urlfilter.{self.name}")
            self._logger_ref = weakref.ref(cached)
        return cached
    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return True when ``url`` passes this filter."""
    def _update_stats(self, passed: bool):
        """Record one verdict in the counter array (total, passed, rejected)."""
        counters = self.stats._counters
        counters[0] += 1
        counters[1] += passed
        counters[2] += not passed
 class FastFilterChain:
    """Optimized filter chain: a URL passes only if every filter accepts it."""
    __slots__ = ("filters", "stats", "_logger_ref")
    def __init__(self, filters: List["FastURLFilter"] = None):
        self.filters = tuple(filters or [])  # Stored as a tuple
        self.stats = FastFilterStats()
        self._logger_ref = None
    @property
    def logger(self):
        """Logger named ``urlfilter.chain``, created on first use."""
        if self._logger_ref is None or self._logger_ref() is None:
            logger = logging.getLogger("urlfilter.chain")
            self._logger_ref = weakref.ref(logger)
        return self._logger_ref()
    def add_filter(self, filter_: "FastURLFilter") -> "FastFilterChain":
        """Append a filter to the chain and return the chain for method chaining."""
        # ``filters`` is a tuple, which has no append(); build a new tuple instead.
        self.filters = self.filters + (filter_,)
        return self
    def apply(self, url: str) -> bool:
        """Return True when every filter accepts ``url``; stop at the first rejection."""
        counters = self.stats._counters
        counters[0] += 1  # total
        for f in self.filters:
            if not f.apply(url):
                counters[2] += 1  # rejected
                return False
        counters[1] += 1  # passed
        return True
 class FastURLPatternFilter(FastURLFilter):
    """Pattern filter balancing speed and completeness.

    Patterns are sorted into buckets when the filter is built:

    * ``*.ext`` globs become literal filename suffixes,
    * ``prefix/*`` globs become literal prefixes,
    * regular expressions (strings that look like one, every string when
      ``use_glob`` is false, and pre-compiled patterns) are searched as-is,
    * every other glob is translated with ``fnmatch.translate``; ``{a,b}``
      alternatives are expanded into separate globs first.

    ``apply`` returns True when any bucket matches and records the verdict on
    every call.
    """
    __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')
    PATTERN_TYPES = {
        'SUFFIX': 1,    # *.html
        'PREFIX': 2,    # /foo/*
        'DOMAIN': 3,    # *.example.com
        'PATH': 4,      # Everything else
        'REGEX': 5,     # Raw regular expressions
    }
    # Characters that make a glob too complex for the literal fast paths.
    _GLOB_META = frozenset('?[{')
    # Innermost, non-empty {a,b,...} group.
    _BRACES = re.compile(r'\{([^{}]+)\}')
    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True):
        super().__init__()
        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self._simple_suffixes = set()
        self._simple_prefixes = set()
        self._domain_patterns = []
        self._path_patterns = []
        for pattern in patterns:
            if isinstance(pattern, str) and not use_glob:
                # Honour use_glob=False the way URLPatternFilter does: strings are regexes.
                pattern_type = self.PATTERN_TYPES['REGEX']
            else:
                pattern_type = self._categorize_pattern(pattern)
            self._add_pattern(pattern, pattern_type)
    def _categorize_pattern(self, pattern: str) -> int:
        """Categorize pattern for specialized handling"""
        if not isinstance(pattern, str):
            return self.PATTERN_TYPES['PATH']
        # Check if it's a regex pattern
        if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
            return self.PATTERN_TYPES['REGEX']
        # Literal fast paths only apply to a single '*' with no other glob syntax.
        if pattern.count('*') == 1 and not self._GLOB_META.intersection(pattern):
            if pattern.startswith('*.') and '/' not in pattern:
                return self.PATTERN_TYPES['SUFFIX']
            if pattern.endswith('/*'):
                return self.PATTERN_TYPES['PREFIX']
        if '://' in pattern and pattern.startswith('*.'):
            return self.PATTERN_TYPES['DOMAIN']
        return self.PATTERN_TYPES['PATH']
    @classmethod
    def _expand_braces(cls, pattern: str) -> List[str]:
        """Expand ``{a,b}`` groups into the list of plain globs they stand for."""
        match = cls._BRACES.search(pattern)
        if match is None:
            return [pattern]
        head, tail = pattern[:match.start()], pattern[match.end():]
        expanded = []
        for option in match.group(1).split(','):
            expanded.extend(cls._expand_braces(head + option + tail))
        return expanded
    def _add_pattern(self, pattern: str, pattern_type: int):
        """Add pattern to appropriate matcher"""
        if not isinstance(pattern, str):
            # Pre-compiled pattern: use it unchanged.
            self._path_patterns.append(pattern)
        elif pattern_type == self.PATTERN_TYPES['REGEX']:
            # For regex patterns, compile directly without glob translation
            self._path_patterns.append(re.compile(pattern))
        elif pattern_type == self.PATTERN_TYPES['SUFFIX']:
            self._simple_suffixes.add(pattern[2:])
        elif pattern_type == self.PATTERN_TYPES['PREFIX']:
            # Keep the trailing '/' so "/foo/*" does not also match "/foobar".
            self._simple_prefixes.add(pattern[:-1])
        elif pattern_type == self.PATTERN_TYPES['DOMAIN']:
            self._domain_patterns.append(
                re.compile(pattern.replace('*.', r'[^/]+\.'))
            )
        else:
            # fnmatch's '*' already spans '/', so '**' needs no special handling.
            # Brace groups must be expanded BEFORE translation, because
            # fnmatch.translate escapes any regex syntax injected into the glob.
            for glob in self._expand_braces(pattern):
                self._path_patterns.append(re.compile(fnmatch.translate(glob)))
    def _matches(self, url: str) -> bool:
        """Return True when any configured pattern matches ``url`` (no stats update)."""
        suffixes = self._simple_suffixes
        prefixes = self._simple_prefixes
        if suffixes or prefixes:
            # Query string and fragment never take part in the literal checks.
            base = url.split('#', 1)[0].split('?', 1)[0]
            if suffixes:
                # Try every dotted tail of the last segment so that both
                # "*.gz" and "*.tar.gz" match "a.tar.gz"; a segment without
                # a dot can never match.
                filename = base.rpartition('/')[2]
                dot = filename.find('.')
                while dot != -1:
                    if filename[dot + 1:] in suffixes:
                        return True
                    dot = filename.find('.', dot + 1)
            if prefixes:
                # Compare against the whole URL and against its path part, so
                # both "https://host/docs/*" and "/docs/*" work on absolute URLs.
                scheme_end = base.find('://')
                if scheme_end == -1:
                    path = base
                else:
                    slash = base.find('/', scheme_end + 3)
                    path = base[slash:] if slash != -1 else ''
                if any(base.startswith(p) or path.startswith(p) for p in prefixes):
                    return True
        if any(p.match(url) for p in self._domain_patterns):
            return True
        return any(p.search(url) for p in self._path_patterns)
    def apply(self, url: str) -> bool:
        """Hierarchical pattern matching; records the verdict on every call.

        Deliberately not wrapped in ``lru_cache``: a cache on this method would
        skip the statistics update on cache hits and keep the filter instance
        alive for the lifetime of the cache.
        """
        matched = self._matches(url)
        self._update_stats(matched)
        return matched
 class FastContentTypeFilter(FastURLFilter):
    """Optimized content type filter using fast lookups.

    The allowed MIME types are turned into a set of file extensions once, at
    construction time; checking a URL is then an extension extraction plus a
    set lookup. URLs without a file extension are accepted.
    """
    __slots__ = ("allowed_types", "_ext_map", "_check_extension")
    # Fast extension to mime type mapping
    _MIME_MAP = {
        # Text Formats
        "txt": "text/plain",
        "html": "text/html",
        "htm": "text/html",
        "xhtml": "application/xhtml+xml",
        "css": "text/css",
        "csv": "text/csv",
        "ics": "text/calendar",
        "js": "application/javascript",
        # Images
        "bmp": "image/bmp",
        "gif": "image/gif",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "tiff": "image/tiff",
        "ico": "image/x-icon",
        "webp": "image/webp",
        # Audio
        "mp3": "audio/mpeg",
        "wav": "audio/wav",
        "ogg": "audio/ogg",
        "m4a": "audio/mp4",
        "aac": "audio/aac",
        # Video
        "mp4": "video/mp4",
        "mpeg": "video/mpeg",
        "webm": "video/webm",
        "avi": "video/x-msvideo",
        "mov": "video/quicktime",
        "flv": "video/x-flv",
        "wmv": "video/x-ms-wmv",
        "mkv": "video/x-matroska",
        # Applications
        "json": "application/json",
        "xml": "application/xml",
        "pdf": "application/pdf",
        "zip": "application/zip",
        "gz": "application/gzip",
        "tar": "application/x-tar",
        "rar": "application/vnd.rar",
        "7z": "application/x-7z-compressed",
        "exe": "application/vnd.microsoft.portable-executable",
        "msi": "application/x-msdownload",
        # Fonts
        "woff": "font/woff",
        "woff2": "font/woff2",
        "ttf": "font/ttf",
        "otf": "font/otf",
        # Microsoft Office
        "doc": "application/msword",
        "dot": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "xls": "application/vnd.ms-excel",
        "ppt": "application/vnd.ms-powerpoint",
        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # OpenDocument Formats
        "odt": "application/vnd.oasis.opendocument.text",
        "ods": "application/vnd.oasis.opendocument.spreadsheet",
        "odp": "application/vnd.oasis.opendocument.presentation",
        # Archives
        "tar.gz": "application/gzip",
        "tgz": "application/gzip",
        "bz2": "application/x-bzip2",
        # Others
        "rtf": "application/rtf",
        "apk": "application/vnd.android.package-archive",
        "epub": "application/epub+zip",
        "jar": "application/java-archive",
        "swf": "application/x-shockwave-flash",
        "midi": "audio/midi",
        "mid": "audio/midi",
        "ps": "application/postscript",
        "ai": "application/postscript",
        "eps": "application/postscript",
        # Custom or less common
        "bin": "application/octet-stream",
        "dmg": "application/x-apple-diskimage",
        "iso": "application/x-iso9660-image",
        "deb": "application/x-debian-package",
        "rpm": "application/x-rpm",
        "sqlite": "application/vnd.sqlite3",
        # Placeholder
        "unknown": "application/octet-stream",  # Fallback for unknown file types
    }
    @staticmethod
    @lru_cache(maxsize=1000)
    def _extract_extension(path: str) -> str:
        """Return the lower-cased extension of the last path segment, or "".

        Accepts a bare path or a full URL. Query string, fragment and the
        ``scheme://host`` part are ignored, so the dots of the host name are
        never mistaken for a file extension.
        """
        path = path.split("#", 1)[0].split("?", 1)[0]
        scheme_end = path.find("://")
        if scheme_end != -1:
            slash = path.find("/", scheme_end + 3)
            if slash == -1:
                # Only scheme and host: there is no path, hence no extension.
                return ""
            path = path[slash:]
        filename = path.rpartition("/")[2]
        if "." not in filename:
            return ""
        return filename.rpartition(".")[2].lower()
    def __init__(
        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        super().__init__()
        # Normalize and store as frozenset for fast lookup; only a bare string
        # is wrapped, so tuples and other iterables of types work as well.
        self.allowed_types = frozenset(
            t.lower()
            for t in (
                [allowed_types] if isinstance(allowed_types, str) else allowed_types
            )
        )
        self._check_extension = check_extension
        # Pre-compute the extensions whose MIME type contains an allowed type
        self._ext_map = frozenset(
            ext
            for ext, mime in self._MIME_MAP.items()
            if any(allowed in mime for allowed in self.allowed_types)
        )
    def _check_url_cached(self, url: str) -> bool:
        """Return True when the URL has no extension or an allowed one.

        Caching happens in the static ``_extract_extension``; this method is
        intentionally not wrapped in ``lru_cache`` because a cache on an
        instance method keeps every filter instance alive.
        """
        if not self._check_extension:
            return True
        ext = self._extract_extension(url)
        if not ext:
            return True
        return ext in self._ext_map
    def apply(self, url: str) -> bool:
        """Fast extension check; records the verdict on every call."""
        result = self._check_url_cached(url)
        self._update_stats(result)
        return result
 class FastDomainFilter(FastURLFilter):
    """Optimized domain filter with fast lookups and caching.

    Entries without glob characters are matched by exact set lookup. Entries
    containing ``*``, ``?`` or ``[`` (for example ``"ads.*"``) are matched as
    globs against the whole host, compiled once into a single regex per list.
    """
    __slots__ = (
        "_allowed_domains",
        "_blocked_domains",
        "_domain_cache",
        "_allowed_globs",
        "_blocked_globs",
    )
    # Host part of a URL: everything after "://" up to the path, query or fragment
    _DOMAIN_REGEX = re.compile(r"://([^/?#]+)")
    # Characters that mark an entry as a glob rather than a literal host name
    _GLOB_CHARS = frozenset("*?[")
    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()
        # None means "no allow-list": everything that is not blocked passes.
        self._allowed_domains = None
        self._allowed_globs = None
        if allowed_domains:
            self._allowed_domains, self._allowed_globs = self._split_domains(
                allowed_domains
            )
        self._blocked_domains = frozenset()
        self._blocked_globs = None
        if blocked_domains:
            self._blocked_domains, self._blocked_globs = self._split_domains(
                blocked_domains
            )
    @staticmethod
    def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
        """Return the domains lower-cased and stripped, as a set."""
        if isinstance(domains, str):
            domains = [domains]
        return {d.lower().strip() for d in domains}
    @classmethod
    def _split_domains(cls, domains: Union[str, List[str]]):
        """Split entries into (frozenset of literal hosts, compiled glob regex or None)."""
        exact = set()
        globs = []
        for domain in cls._normalize_domains(domains):
            if cls._GLOB_CHARS.intersection(domain):
                globs.append(domain)
            else:
                exact.add(domain)
        glob_regex = None
        if globs:
            # Each translated glob is already anchored at the end; match()
            # anchors the start, so every alternative must cover the whole host.
            glob_regex = re.compile(
                "|".join(fnmatch.translate(g) for g in sorted(globs))
            )
        return frozenset(exact), glob_regex
    @staticmethod
    @lru_cache(maxsize=10000)
    def _extract_domain(url: str) -> str:
        """Return the lower-cased host part of ``url``, or "" when there is none."""
        match = FastDomainFilter._DOMAIN_REGEX.search(url)
        return match.group(1).lower() if match else ""
    @staticmethod
    def _matches(domain: str, exact, glob_regex) -> bool:
        """Return True when ``domain`` is a listed literal or matches a listed glob."""
        if domain in exact:
            return True
        return glob_regex is not None and glob_regex.match(domain) is not None
    def apply(self, url: str) -> bool:
        """Optimized domain checking with early returns; records every verdict."""
        # Skip processing if no filters
        if (
            not self._blocked_domains
            and self._blocked_globs is None
            and self._allowed_domains is None
        ):
            self._update_stats(True)
            return True
        domain = self._extract_domain(url)
        # Early return for blocked domains
        if self._matches(domain, self._blocked_domains, self._blocked_globs):
            self._update_stats(False)
            return False
        # If no allowed domains specified, accept all non-blocked
        if self._allowed_domains is None:
            self._update_stats(True)
            return True
        # Final allowed domains check
        result = self._matches(domain, self._allowed_domains, self._allowed_globs)
        self._update_stats(result)
        return result
 def create_fast_filter_chain() -> FastFilterChain:
    """Create an optimized filter chain: domain, then content type, then pattern filter."""
    # Domain filter first (fastest rejection)
    domain_stage = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    # Content filter second (medium speed)
    content_stage = FastContentTypeFilter(["text/html", "application/xhtml+xml"])
    # Pattern filter last (most expensive)
    pattern_stage = FastURLPatternFilter(["*.html", "*.htm", "*/article/*", "*/blog/*"])
    return FastFilterChain([domain_stage, content_stage, pattern_stage])
 def run_performance_test():
    """Benchmark the original filters against the optimized ones and print the results.

    Times every filter pair on a 1000-URL subset, then both full chains on
    the complete URL list, and finally prints ``sys.getsizeof`` for each
    filter object. Nothing is returned.
    """
    import sys
    import time
    # Generate test URLs
    base_urls = [
        "https://example.com/article/123",
        "https://blog.example.com/post/456",
        "https://ads.example.com/tracking",
        "https://example.com/about.html",
        "https://analytics.example.com/script.js",
        "https://example.com/products.php",
        "https://subdomain.example.com/blog/post-123",
        "https://example.com/path/file.pdf",
    ]
    # Create more varied test data
    test_urls = []
    for base in base_urls:
        # Add original
        test_urls.append(base)
        # Add variations
        parts = base.split("/")
        for i in range(10):
            parts[-1] = f"page_{i}.html"
            test_urls.append("/".join(parts))
    # Multiply to get enough test data
    test_urls = test_urls * 10000  # 8 bases x 11 variants x 10000 = 880k URLs
    def benchmark(name: str, func, *args, warmup=True):
        """Time ``func(*args)`` (after an optional warm-up call) and print ms and URLs/sec."""
        if warmup:
            # Warmup run; note that it also fills any lru_cache used by the callee
            func(*args)
        # Actual timing
        start = time.perf_counter_ns()
        result = func(*args)
        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
        # Throughput must use the number of URLs this run processed (the
        # length of the returned list), not the size of the full data set.
        processed = len(result) if hasattr(result, "__len__") else len(test_urls)
        rate = processed / elapsed * 1000 if elapsed > 0 else float("inf")
        print(f"{name:<30} {elapsed:>8.3f} ms  ({rate:,.0f} URLs/sec)")
        return result
    print("\nBenchmarking original vs optimized implementations...")
    print("-" * 70)
    # Original implementation
    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
    content_filter = ContentTypeFilter(["text/html"])
    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
    chain = FilterChain([pattern_filter, content_filter, domain_filter])
    # Optimized implementation
    fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
    fast_content_filter = FastContentTypeFilter(["text/html"])
    fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
    fast_chain = FastFilterChain(
        [fast_domain_filter, fast_content_filter, fast_pattern_filter]
    )
    # Test individual filters
    print("\nSingle filter performance (first 1000 URLs):")
    test_subset = test_urls[:1000]
    print("\nPattern Filters:")
    benchmark(
        "Original Pattern Filter",
        lambda: [pattern_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Pattern Filter",
        lambda: [fast_pattern_filter.apply(url) for url in test_subset],
    )
    print("\nContent Filters:")
    benchmark(
        "Original Content Filter",
        lambda: [content_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Content Filter",
        lambda: [fast_content_filter.apply(url) for url in test_subset],
    )
    print("\nDomain Filters:")
    benchmark(
        "Original Domain Filter",
        lambda: [domain_filter.apply(url) for url in test_subset],
    )
    benchmark(
        "Optimized Domain Filter",
        lambda: [fast_domain_filter.apply(url) for url in test_subset],
    )
    print("\nFull Chain Performance (all URLs):")
    # Test chain
    benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
    benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])
    # Memory usage. sys.getsizeof is shallow: it does not include the sets,
    # lists and compiled patterns the filters reference.
    print("\nMemory Usage per Filter:")
    print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
    print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
    print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
    print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
    print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
    print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
 def test_pattern_filter():
    """Check FastURLPatternFilter against a table of expected verdicts, then time it.

    The accuracy pass prints one line per (pattern, URL) pair. The speed
    comparison against URLPatternFilter only runs when every expectation held.
    Results are printed; nothing is returned and nothing is asserted.
    """
    import time
    from itertools import chain
    # Test cases as list of tuples instead of dict for multiple patterns
    # Each entry is (pattern or list of patterns, {url: expected verdict}).
    test_cases = [
        # Simple suffix patterns (*.html)
        ("*.html", {
            "https://example.com/page.html": True,
            "https://example.com/path/doc.html": True,
            "https://example.com/page.htm": False,
            "https://example.com/page.html?param=1": True,
        }),
        # Path prefix patterns (/foo/*)
        ("*/article/*", {
            "https://example.com/article/123": True,
            "https://example.com/blog/article/456": True,
            "https://example.com/articles/789": False,
            "https://example.com/article": False,
        }),
        # Complex patterns
        ("blog-*-[0-9]", {
            "https://example.com/blog-post-1": True,
            "https://example.com/blog-test-9": True,
            "https://example.com/blog-post": False,
            "https://example.com/blog-post-x": False,
        }),
        # Multiple patterns case
        (["*.pdf", "*/download/*"], {
            "https://example.com/doc.pdf": True,
            "https://example.com/download/file.txt": True,
            "https://example.com/path/download/doc": True,
            "https://example.com/uploads/file.txt": False,
        }),
        # Edge cases
        ("*", {
            "https://example.com": True,
            "": True,
            "http://test.com/path": True,
        }),
        # Complex regex
        (r"^https?://.*\.example\.com/\d+", {
            "https://sub.example.com/123": True,
            "http://test.example.com/456": True,
            "https://example.com/789": False,
            "https://sub.example.com/abc": False,
        })
    ]
    def run_accuracy_test():
        """Apply every pattern to its URLs; return True only if all verdicts match."""
        print("\nAccuracy Tests:")
        print("-" * 50)
        all_passed = True
        for patterns, test_urls in test_cases:
            # A fresh filter per table entry, so entries cannot influence each other.
            filter_obj = FastURLPatternFilter(patterns)
            for url, expected in test_urls.items():
                result = filter_obj.apply(url)
                if result != expected:
                    print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
                    print(f"   Expected: {expected}, Got: {result}")
                    all_passed = False
                else:
                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
        return all_passed
    def run_speed_test():
        """Time the original and the optimized pattern filter on the same URL list."""
        print("\nSpeed Tests:")
        print("-" * 50)
        # Create a large set of test URLs
        all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases))
        test_urls = all_urls * 10000  # 100K+ URLs
        # Test both implementations
        original = URLPatternFilter(["*.html", "*/article/*", "blog-*"])
        optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"])
        def benchmark(name, filter_obj):
            """Apply ``filter_obj`` to every test URL and print seconds and URLs/sec."""
            start = time.perf_counter()
            for url in test_urls:
                filter_obj.apply(url)
            elapsed = time.perf_counter() - start
            urls_per_sec = len(test_urls) / elapsed
            print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")
        benchmark("Original Filter:", original)
        benchmark("Optimized Filter:", optimized)
    # Run tests
    print("Running Pattern Filter Tests...")
    accuracy_passed = run_accuracy_test()
    if accuracy_passed:
        print("\n✨ All accuracy tests passed!")
        # Speed numbers are only meaningful once the filter is known to be correct.
        run_speed_test()
    else:
        print("\n❌ Some accuracy tests failed!")
 if __name__ == "__main__":
    run_performance_test()
    # test_pattern_filter()
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -8,6 +8,7 @@ from crawl4ai.scraper import (
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 import time
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
@@ -55,6 +56,7 @@ async def basic_scraper_example():
 # advanced_scraper_example.py
 import logging
 from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
@@ -177,11 +179,15 @@ async def advanced_scraper_example():
 if __name__ == "__main__":
    import asyncio
    import time
    # Run basic example
    start_time = time.perf_counter()
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())
    end_time = time.perf_counter()
    print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
-    # Run advanced example
+    # # Run advanced example
-    print("\nRunning advanced scraper example...")
+    # print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # asyncio.run(advanced_scraper_example())
--- a/docs/scraper/scraper_quickstart_review.py
+++ b/docs/scraper/scraper_quickstart_review.py
@@ -0,0 +1,187 @@
 # basic_scraper_example.py
 from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 # Browser settings shared by both examples in this file: headless, 800x600 viewport.
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
 async def basic_scraper_example():
    """
    Basic example: scrape https://crawl4ai.com/mkdocs and report page sizes
    - Keeps only URLs matching "*/tutorial/*"
    - Keeps only "text/html" content
    - Awaits a single ascrape() call, then prints every collected page
    """
    # Build the individual filters first, in the order the chain receives them
    tutorial_only = URLPatternFilter("*/tutorial/*")
    html_only = ContentTypeFilter(["text/html"])
    url_filters = FilterChain([tutorial_only, html_only])
    # Strategy settings: depth limit of 2, no scorer supplied
    bfs_strategy = BFSScraperStrategy(
        max_depth=2,
        filter_chain=url_filters,
        url_scorer=None,
        process_external_links=True,
    )
    # The crawler is opened as an async context manager for the whole run
    async with AsyncWebCrawler(config=browser_config, verbose=True) as web_crawler:
        scraper = AsyncWebScraper(web_crawler, bfs_strategy)
        try:
            outcome = await scraper.ascrape("https://crawl4ai.com/mkdocs")
            # Summarise what came back: page count, then one line per page
            print(f"Crawled {len(outcome.crawled_urls)} pages:")
            for page_url, page in outcome.extracted_data.items():
                print(f"- {page_url}: {len(page.html)} bytes")
        except Exception as exc:
            # Any failure while scraping or reporting is printed, not re-raised
            print(f"Error during scraping: {exc}")
 # advanced_scraper_example.py
 import logging
 from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 async def advanced_scraper_example():
    """
    Advanced example: filtered, scored, streaming scrape of https://techcrunch.com
    - Chains a domain filter, a URL-pattern filter and a content-type filter
    - Combines keyword, path-depth and freshness scorers into one composite
    - Consumes results one at a time by passing stream=True
    - Logs per-URL progress, then counters, filter stats and scorer stats
    """
    # Logging is configured here, at call time, at INFO level
    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("advanced_scraper")
    # Filters, built in the order they are placed in the chain
    domain_rule = DomainFilter(
        allowed_domains=["techcrunch.com"],
        blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
    )
    path_rule = URLPatternFilter(
        [
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*"),  # four digits / two digits / rest (date-like paths)
        ]
    )
    mime_rule = ContentTypeFilter(["text/html", "application/xhtml+xml"])
    filter_chain = FilterChain([domain_rule, path_rule, mime_rule])
    # Scorers, each with its own weight, merged into a single composite
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["news", "breaking", "update", "latest"], weight=1.0
    )
    depth_scorer = PathDepthScorer(optimal_depth=3, weight=0.7)
    freshness_scorer = FreshnessScorer(weight=0.9)
    scorer = CompositeScorer([keyword_scorer, depth_scorer, freshness_scorer])
    # Strategy wiring the filters and the scorer together
    bfs_strategy = BFSScraperStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )
    async with AsyncWebCrawler(verbose=True, config=browser_config) as web_crawler:
        scraper = AsyncWebScraper(web_crawler, bfs_strategy)
        # Running totals updated as each streamed result arrives
        counters = {"processed": 0, "errors": 0, "total_size": 0}
        try:
            page_stream = await scraper.ascrape(
                "https://techcrunch.com", stream=True
            )
            async for page in page_stream:
                counters["processed"] += 1
                if not page.success:
                    counters["errors"] += 1
                    log.error(
                        f"Failed to process {page.url}: {page.error_message}"
                    )
                else:
                    counters["total_size"] += len(page.html)
                    log.info(f"Processed: {page.url}")
                # Progress line on every tenth result, successful or not
                if counters["processed"] % 10 == 0:
                    log.info(f"Progress: {counters['processed']} URLs processed")
        except Exception as exc:
            # Failures are logged; the summary below is still emitted
            log.error(f"Scraping error: {exc}")
        finally:
            # Overall counters
            log.info("Scraping completed:")
            log.info(f"- URLs processed: {counters['processed']}")
            log.info(f"- Errors: {counters['errors']}")
            log.info(f"- Total content size: {counters['total_size'] / 1024:.2f} KB")
            # Per-filter pass/reject counts
            for flt in filter_chain.filters:
                log.info(f"{flt.name} stats:")
                log.info(f"- Passed: {flt.stats.passed_urls}")
                log.info(f"- Rejected: {flt.stats.rejected_urls}")
            # Composite scorer summary
            log.info("Scoring statistics:")
            log.info(f"- Average score: {scorer.stats.average_score:.2f}")
            log.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )
 if __name__ == "__main__":
    import asyncio
    # Run the basic example first, then the advanced one, each in its own
    # asyncio.run() call and each preceded by its banner line.
    for banner, example in (
        ("Running basic scraper example...", basic_scraper_example),
        ("\nRunning advanced scraper example...", advanced_scraper_example),
    ):
        print(banner)
        asyncio.run(example())