refactor(scraper): optimize URL validation and filter performance

- Replace validators library with built-in urlparse for URL validation - Optimize filter statistics update logic for better performance - Add performance benchmarking suite for filters - Add execution time tracking to scraper examples - Update gitignore with windsurfrules BREAKING CHANGE: Removed dependency on validators library for URL validation
2025-01-22 19:45:56 +08:00
parent 6e78c56dda
commit e6ef8d91ba
6 changed files with 1140 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -227,4 +227,7 @@ tree.md
 .do
 /plans
 .codeiumignore
-todo/
+todo/
+
+# windsurf rules
+.windsurfrules
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -5,7 +5,7 @@ import asyncio
 import logging
 from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
-import validators
+# import validators

 from ..async_configs import CrawlerRunConfig
 from .models import CrawlResult
@@ -54,14 +54,22 @@ class BFSScraperStrategy(ScraperStrategy):
    async def can_process_url(self, url: str, depth: int) -> bool:
        """Check if URL can be processed based on robots.txt and filters
        This is our gatekeeper method that determines if a URL should be processed. It:
-            - Validates URL format using the validators library
+            - Validates URL format using a robust built-in method
            - Checks robots.txt permissions for the domain
            - Applies custom filters from the filter chain
            - Updates statistics for blocked URLs
            - Returns False early if any check fails
        """
-        if not validators.url(url):
-            self.logger.warning(f"Invalid URL: {url}")
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                raise ValueError("Invalid URL")
+            if result.scheme not in ('http', 'https'):
+                raise ValueError("URL must be HTTP or HTTPS")
+            if not result.netloc or '.' not in result.netloc:
+                raise ValueError("Invalid domain")
+        except Exception as e:
+            self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
            return False

        robot_parser = await self._get_robot_parser(url)
@@ -70,7 +78,7 @@ class BFSScraperStrategy(ScraperStrategy):
            self.logger.info(f"Blocked by robots.txt: {url}")
            return False

-        # Apply the filter chain it's not start page
+        # Apply the filter chain if it's not start page
        if depth != 0 and not self.filter_chain.apply(url):
            return False

--- a/crawl4ai/scraper/filters.py
+++ b/crawl4ai/scraper/filters.py
@@ -36,11 +36,15 @@ class URLFilter(ABC):

    def _update_stats(self, passed: bool):
        """Update filter statistics"""
+        # INFO: Old trick to make things faster
        self.stats.total_urls += 1
-        if passed:
-            self.stats.passed_urls += 1
-        else:
-            self.stats.rejected_urls += 1
+        self.stats.passed_urls += passed
+        self.stats.rejected_urls += not passed        
+        # self.stats.total_urls += 1
+        # if passed:
+        #     self.stats.passed_urls += 1
+        # else:
+        #     self.stats.rejected_urls += 1


 class FilterChain:
@@ -233,3 +237,49 @@ def create_common_filter_chain() -> FilterChain:
            DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
        ]
    )
+
+
+
+def run_performance_test():
+    import time
+    import random
+    
+    # Test URLs
+    test_urls = [
+        'https://example.com/article/123',
+        'https://blog.example.com/post/456',
+        'https://ads.example.com/tracking',
+        'https://example.com/about.html',
+        'https://analytics.example.com/script.js',
+        'https://example.com/products.php',
+        'https://subdomain.example.com/blog/post-123',
+        'https://example.com/path/file.pdf',
+    ] * 100000  # Create 800k URLs to test
+    
+    def benchmark(name: str, func, *args):
+        start = time.perf_counter_ns()
+        result = func(*args) if args[0] else func()
+        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
+        print(f"{name:<30} {elapsed:>8.3f} ms")
+        return result
+
+    # Test individual filters
+    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
+    content_filter = ContentTypeFilter(["text/html"])
+    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
+    
+    # Test chain
+    chain = FilterChain([pattern_filter, content_filter, domain_filter])
+    
+    print("\nBenchmarking individual filters...")
+    for url in test_urls[:5]:  # Show first 5 results
+        print(f"\nTesting URL: {url}")
+        benchmark("Pattern Filter", pattern_filter.apply, url)
+        benchmark("Content Filter", content_filter.apply, url)
+        benchmark("Domain Filter", domain_filter.apply, url)
+    
+    print("\nBenchmarking full chain...")
+    benchmark("Full Chain", lambda: [chain.apply(url) for url in test_urls], None)
+
+if __name__ == "__main__":
+    run_performance_test()
--- a/crawl4ai/scraper/filters_review.py
+++ b/crawl4ai/scraper/filters_review.py
@@ -0,0 +1,873 @@
+# from .url_filter import URLFilter, FilterChain
+# from .content_type_filter import ContentTypeFilter
+# from .url_pattern_filter import URLPatternFilter
+
+from abc import ABC, abstractmethod
+from typing import List, Pattern, Set, Union, FrozenSet
+import re, time
+from urllib.parse import urlparse
+from array import array
+import logging
+from functools import lru_cache
+import fnmatch
+from dataclasses import dataclass
+from typing import ClassVar
+import weakref
+import mimetypes
+
+
+@dataclass
+class FilterStats:
+    # PERF: Using dataclass creates overhead with __init__ and property access
+    # PERF: Could use __slots__ to reduce memory footprint
+    # PERF: Consider using array.array('I') for atomic increments
+    total_urls: int = 0
+    rejected_urls: int = 0
+    passed_urls: int = 0
+
+
+class URLFilter(ABC):
+    # PERF: Logger creation is expensive, consider lazy initialization
+    # PERF: stats object creation adds overhead for each filter instance
+    def __init__(self, name: str = None):
+        self.name = name or self.__class__.__name__
+        self.stats = FilterStats()
+        self.logger = logging.getLogger(f"urlfilter.{self.name}")
+
+    @abstractmethod
+    def apply(self, url: str) -> bool:
+        pass
+
+    def _update_stats(self, passed: bool):
+        # PERF: Already optimized but could use bitwise operations
+        # PERF: Consider removing stats entirely in production/fast mode
+        self.stats.total_urls += 1
+        self.stats.passed_urls += passed
+        self.stats.rejected_urls += not passed
+
+
+class FilterChain:
+    # PERF: List traversal for each URL is expensive
+    # PERF: Could use array.array instead of list for filters
+    # PERF: Consider adding fast path for single filter case
+    def __init__(self, filters: List[URLFilter] = None):
+        self.filters = filters or []
+        self.stats = FilterStats()
+        self.logger = logging.getLogger("urlfilter.chain")
+
+    def apply(self, url: str) -> bool:
+        # PERF: Logging on every rejection is expensive
+        # PERF: Could reorder filters by rejection rate
+        # PERF: Consider batch processing mode
+        self.stats.total_urls += 1
+
+        for filter_ in self.filters:
+            if not filter_.apply(url):
+                self.stats.rejected_urls += 1
+                self.logger.debug(f"URL {url} rejected by {filter_.name}")
+                return False
+
+        self.stats.passed_urls += 1
+        return True
+
+
+class URLPatternFilter(URLFilter):
+    # PERF: Converting glob to regex is expensive
+    # PERF: Multiple regex compilation is slow
+    # PERF: List of patterns causes multiple regex evaluations
+    def __init__(
+        self,
+        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
+        use_glob: bool = True,
+    ):
+        super().__init__()
+        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
+        self.use_glob = use_glob
+        self._compiled_patterns = []
+
+        # PERF: This could be consolidated into a single regex with OR conditions
+        # PERF: glob_to_regex creates complex patterns, could be simplified
+        for pattern in self.patterns:
+            if isinstance(pattern, str) and use_glob:
+                self._compiled_patterns.append(self._glob_to_regex(pattern))
+            else:
+                self._compiled_patterns.append(
+                    re.compile(pattern) if isinstance(pattern, str) else pattern
+                )
+
+    def _glob_to_regex(self, pattern: str) -> Pattern:
+        # PERF: fnmatch.translate creates overly complex patterns
+        # PERF: Could cache common translations
+        return re.compile(fnmatch.translate(pattern))
+
+    def apply(self, url: str) -> bool:
+        # PERF: any() with generator is slower than direct loop with early return
+        # PERF: searching entire string is slower than anchored match
+        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
+        self._update_stats(matches)
+        return matches
+
+
+class ContentTypeFilter(URLFilter):
+    # PERF: mimetypes guessing is extremely slow
+    # PERF: URL parsing on every check is expensive
+    # PERF: No caching of results for similar extensions
+    def __init__(
+        self, allowed_types: Union[str, List[str]], check_extension: bool = True
+    ):
+        super().__init__()
+        self.allowed_types = (
+            [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        )
+        self.check_extension = check_extension
+        self._normalize_types()
+
+    def _normalize_types(self):
+        """Normalize content type strings"""
+        self.allowed_types = [t.lower() for t in self.allowed_types]
+
+    def _check_extension(self, url: str) -> bool:
+        # PERF: urlparse is called on every check
+        # PERF: multiple string splits are expensive
+        # PERF: mimetypes.guess_type is very slow
+        ext = (
+            urlparse(url).path.split(".")[-1].lower()
+            if "." in urlparse(url).path
+            else ""
+        )
+        if not ext:
+            return True
+
+        # PERF: guess_type is main bottleneck
+        guessed_type = mimetypes.guess_type(url)[0]
+        return any(
+            allowed in (guessed_type or "").lower() for allowed in self.allowed_types
+        )
+
+    def apply(self, url: str) -> bool:
+        """Check if URL's content type is allowed"""
+        result = True
+        if self.check_extension:
+            result = self._check_extension(url)
+        self._update_stats(result)
+        return result
+
+
+class DomainFilter(URLFilter):
+    # PERF: Set lookups are fast but string normalizations on init are not
+    # PERF: Creating two sets doubles memory usage
+    def __init__(
+        self,
+        allowed_domains: Union[str, List[str]] = None,
+        blocked_domains: Union[str, List[str]] = None,
+    ):
+        super().__init__()
+        # PERF: Normalizing domains on every init is wasteful
+        # PERF: Could use frozenset for immutable lists
+        self.allowed_domains = (
+            set(self._normalize_domains(allowed_domains)) if allowed_domains else None
+        )
+        self.blocked_domains = (
+            set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
+        )
+
+    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
+        # PERF: strip() and lower() create new strings for each domain
+        # PERF: List comprehension creates intermediate list
+        if isinstance(domains, str):
+            domains = [domains]
+        return [d.lower().strip() for d in domains]
+
+    def _extract_domain(self, url: str) -> str:
+        # PERF: urlparse is called for every URL check
+        # PERF: lower() creates new string every time
+        # PERF: Could cache recent results
+        return urlparse(url).netloc.lower()
+
+    def apply(self, url: str) -> bool:
+        # PERF: Two separate set lookups in worst case
+        # PERF: Domain extraction happens before knowing if we have any filters
+        domain = self._extract_domain(url)
+
+        if domain in self.blocked_domains:
+            self._update_stats(False)
+            return False
+
+        if self.allowed_domains is not None and domain not in self.allowed_domains:
+            self._update_stats(False)
+            return False
+
+        self._update_stats(True)
+        return True
+
+
+# Example usage:
+def create_common_filter_chain() -> FilterChain:
+    """Create a commonly used filter chain"""
+    return FilterChain(
+        [
+            URLPatternFilter(
+                [
+                    "*.html",
+                    "*.htm",  # HTML files
+                    "*/article/*",
+                    "*/blog/*",  # Common content paths
+                ]
+            ),
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
+            DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
+        ]
+    )
+
+
+####################################################################################
+# Uncledoe: Optimized Version
+####################################################################################
+
+
+# Use __slots__ and array for maximum memory/speed efficiency
+class FastFilterStats:
+    __slots__ = ("_counters",)
+
+    def __init__(self):
+        # Use array of unsigned ints for atomic operations
+        self._counters = array("I", [0, 0, 0])  # total, passed, rejected
+
+    @property
+    def total_urls(self):
+        return self._counters[0]
+
+    @property
+    def passed_urls(self):
+        return self._counters[1]
+
+    @property
+    def rejected_urls(self):
+        return self._counters[2]
+
+
+class FastURLFilter(ABC):
+    """Optimized base filter class"""
+
+    __slots__ = ("name", "stats", "_logger_ref")
+
+    def __init__(self, name: str = None):
+        self.name = name or self.__class__.__name__
+        self.stats = FastFilterStats()
+        # Lazy logger initialization using weakref
+        self._logger_ref = None
+
+    @property
+    def logger(self):
+        if self._logger_ref is None or self._logger_ref() is None:
+            logger = logging.getLogger(f"urlfilter.{self.name}")
+            self._logger_ref = weakref.ref(logger)
+        return self._logger_ref()
+
+    @abstractmethod
+    def apply(self, url: str) -> bool:
+        pass
+
+    def _update_stats(self, passed: bool):
+        # Use direct array index for speed
+        self.stats._counters[0] += 1  # total
+        self.stats._counters[1] += passed  # passed
+        self.stats._counters[2] += not passed  # rejected
+
+
+class FastFilterChain:
+    """Optimized filter chain"""
+
+    __slots__ = ("filters", "stats", "_logger_ref")
+
+    def __init__(self, filters: List[FastURLFilter] = None):
+        self.filters = tuple(filters or [])  # Immutable tuple for speed
+        self.stats = FastFilterStats()
+        self._logger_ref = None
+
+    @property
+    def logger(self):
+        if self._logger_ref is None or self._logger_ref() is None:
+            logger = logging.getLogger("urlfilter.chain")
+            self._logger_ref = weakref.ref(logger)
+        return self._logger_ref()
+
+    def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain":
+        """Add a filter to the chain"""
+        self.filters.append(filter_)
+        return self  # Enable method chaining
+
+    def apply(self, url: str) -> bool:
+        """Optimized apply with minimal operations"""
+        self.stats._counters[0] += 1  # total
+
+        # Direct tuple iteration is faster than list
+        for f in self.filters:
+            if not f.apply(url):
+                self.stats._counters[2] += 1  # rejected
+                return False
+
+        self.stats._counters[1] += 1  # passed
+        return True
+
+class FastURLPatternFilter(FastURLFilter):
+    """Pattern filter balancing speed and completeness"""
+    __slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')
+    
+    PATTERN_TYPES = {
+        'SUFFIX': 1,    # *.html
+        'PREFIX': 2,    # /foo/*
+        'DOMAIN': 3,    # *.example.com
+        'PATH': 4 ,      # Everything else
+        'REGEX': 5 
+    }
+    
+    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True):
+        super().__init__()
+        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
+        
+        self._simple_suffixes = set()
+        self._simple_prefixes = set()
+        self._domain_patterns = []
+        self._path_patterns = []
+        
+        for pattern in patterns:
+            pattern_type = self._categorize_pattern(pattern)
+            self._add_pattern(pattern, pattern_type)
+    
+    def _categorize_pattern(self, pattern: str) -> int:
+        """Categorize pattern for specialized handling"""
+        if not isinstance(pattern, str):
+            return self.PATTERN_TYPES['PATH']
+            
+        # Check if it's a regex pattern
+        if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
+            return self.PATTERN_TYPES['REGEX']
+        
+        if pattern.count('*') == 1:
+            if pattern.startswith('*.'):
+                return self.PATTERN_TYPES['SUFFIX']
+            if pattern.endswith('/*'):
+                return self.PATTERN_TYPES['PREFIX']
+                
+        if '://' in pattern and pattern.startswith('*.'):
+            return self.PATTERN_TYPES['DOMAIN']
+            
+        return self.PATTERN_TYPES['PATH']
+    
+    def _add_pattern(self, pattern: str, pattern_type: int):
+        """Add pattern to appropriate matcher"""
+        if pattern_type == self.PATTERN_TYPES['REGEX']:
+            # For regex patterns, compile directly without glob translation
+            if isinstance(pattern, str) and (pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern):
+                self._path_patterns.append(re.compile(pattern))
+                return
+        elif pattern_type == self.PATTERN_TYPES['SUFFIX']:
+            self._simple_suffixes.add(pattern[2:])
+        elif pattern_type == self.PATTERN_TYPES['PREFIX']:
+            self._simple_prefixes.add(pattern[:-2])
+        elif pattern_type == self.PATTERN_TYPES['DOMAIN']:
+            self._domain_patterns.append(
+                re.compile(pattern.replace('*.', r'[^/]+\.'))
+            )
+        else:
+            if isinstance(pattern, str):
+                # Handle complex glob patterns
+                if '**' in pattern:
+                    pattern = pattern.replace('**', '.*')
+                if '{' in pattern:
+                    # Convert {a,b} to (a|b)
+                    pattern = re.sub(r'\{([^}]+)\}', 
+                                   lambda m: f'({"|".join(m.group(1).split(","))})',
+                                   pattern)
+                pattern = fnmatch.translate(pattern)
+            self._path_patterns.append(
+                pattern if isinstance(pattern, Pattern) else re.compile(pattern)
+            )
+
+    @lru_cache(maxsize=10000)
+    def apply(self, url: str) -> bool:
+        """Hierarchical pattern matching"""
+        # Quick suffix check (*.html)
+        if self._simple_suffixes:
+            path = url.split('?')[0]
+            if path.split('/')[-1].split('.')[-1] in self._simple_suffixes:
+                self._update_stats(True)
+                return True
+                
+        # Domain check
+        if self._domain_patterns:
+            for pattern in self._domain_patterns:
+                if pattern.match(url):
+                    self._update_stats(True)
+                    return True
+        
+        # Prefix check (/foo/*)
+        if self._simple_prefixes:
+            path = url.split('?')[0]
+            if any(path.startswith(p) for p in self._simple_prefixes):
+                self._update_stats(True)
+                return True
+                
+        # Complex patterns
+        if self._path_patterns:
+            if any(p.search(url) for p in self._path_patterns):
+                self._update_stats(True)
+                return True
+        
+        self._update_stats(False)
+        return False
+
+
+class FastContentTypeFilter(FastURLFilter):
+    """Optimized content type filter using fast lookups"""
+
+    __slots__ = ("allowed_types", "_ext_map", "_check_extension")
+
+    # Fast extension to mime type mapping
+    _MIME_MAP = {
+        # Text Formats
+        "txt": "text/plain",
+        "html": "text/html",
+        "htm": "text/html",
+        "xhtml": "application/xhtml+xml",
+        "css": "text/css",
+        "csv": "text/csv",
+        "ics": "text/calendar",
+        "js": "application/javascript",
+        # Images
+        "bmp": "image/bmp",
+        "gif": "image/gif",
+        "jpeg": "image/jpeg",
+        "jpg": "image/jpeg",
+        "png": "image/png",
+        "svg": "image/svg+xml",
+        "tiff": "image/tiff",
+        "ico": "image/x-icon",
+        "webp": "image/webp",
+        # Audio
+        "mp3": "audio/mpeg",
+        "wav": "audio/wav",
+        "ogg": "audio/ogg",
+        "m4a": "audio/mp4",
+        "aac": "audio/aac",
+        # Video
+        "mp4": "video/mp4",
+        "mpeg": "video/mpeg",
+        "webm": "video/webm",
+        "avi": "video/x-msvideo",
+        "mov": "video/quicktime",
+        "flv": "video/x-flv",
+        "wmv": "video/x-ms-wmv",
+        "mkv": "video/x-matroska",
+        # Applications
+        "json": "application/json",
+        "xml": "application/xml",
+        "pdf": "application/pdf",
+        "zip": "application/zip",
+        "gz": "application/gzip",
+        "tar": "application/x-tar",
+        "rar": "application/vnd.rar",
+        "7z": "application/x-7z-compressed",
+        "exe": "application/vnd.microsoft.portable-executable",
+        "msi": "application/x-msdownload",
+        # Fonts
+        "woff": "font/woff",
+        "woff2": "font/woff2",
+        "ttf": "font/ttf",
+        "otf": "font/otf",
+        # Microsoft Office
+        "doc": "application/msword",
+        "dot": "application/msword",
+        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "xls": "application/vnd.ms-excel",
+        "ppt": "application/vnd.ms-powerpoint",
+        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        # OpenDocument Formats
+        "odt": "application/vnd.oasis.opendocument.text",
+        "ods": "application/vnd.oasis.opendocument.spreadsheet",
+        "odp": "application/vnd.oasis.opendocument.presentation",
+        # Archives
+        "tar.gz": "application/gzip",
+        "tgz": "application/gzip",
+        "bz2": "application/x-bzip2",
+        # Others
+        "rtf": "application/rtf",
+        "apk": "application/vnd.android.package-archive",
+        "epub": "application/epub+zip",
+        "jar": "application/java-archive",
+        "swf": "application/x-shockwave-flash",
+        "midi": "audio/midi",
+        "mid": "audio/midi",
+        "ps": "application/postscript",
+        "ai": "application/postscript",
+        "eps": "application/postscript",
+        # Custom or less common
+        "bin": "application/octet-stream",
+        "dmg": "application/x-apple-diskimage",
+        "iso": "application/x-iso9660-image",
+        "deb": "application/x-debian-package",
+        "rpm": "application/x-rpm",
+        "sqlite": "application/vnd.sqlite3",
+        # Placeholder
+        "unknown": "application/octet-stream",  # Fallback for unknown file types
+    }
+
+    @staticmethod
+    @lru_cache(maxsize=1000)
+    def _extract_extension(path: str) -> str:
+        """Fast extension extraction with caching"""
+        if "." not in path:
+            return ""
+        return path.rpartition(".")[-1].lower()
+
+    def __init__(
+        self, allowed_types: Union[str, List[str]], check_extension: bool = True
+    ):
+        super().__init__()
+        # Normalize and store as frozenset for fast lookup
+        self.allowed_types = frozenset(
+            t.lower()
+            for t in (
+                allowed_types if isinstance(allowed_types, list) else [allowed_types]
+            )
+        )
+        self._check_extension = check_extension
+
+        # Pre-compute extension map for allowed types
+        self._ext_map = frozenset(
+            ext
+            for ext, mime in self._MIME_MAP.items()
+            if any(allowed in mime for allowed in self.allowed_types)
+        )
+
+    @lru_cache(maxsize=1000)
+    def _check_url_cached(self, url: str) -> bool:
+        """Cached URL checking"""
+        if not self._check_extension:
+            return True
+
+        path = url.split("?")[0]  # Fast path split
+        ext = self._extract_extension(path)
+        if not ext:
+            return True
+
+        return ext in self._ext_map
+
+    def apply(self, url: str) -> bool:
+        """Fast extension check with caching"""
+        result = self._check_url_cached(url)
+        self._update_stats(result)
+        return result
+
+
+class FastDomainFilter(FastURLFilter):
+    """Optimized domain filter with fast lookups and caching"""
+
+    __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
+
+    # Regex for fast domain extraction
+    _DOMAIN_REGEX = re.compile(r"://([^/]+)")
+
+    def __init__(
+        self,
+        allowed_domains: Union[str, List[str]] = None,
+        blocked_domains: Union[str, List[str]] = None,
+    ):
+        super().__init__()
+
+        # Convert inputs to frozensets for immutable, fast lookups
+        self._allowed_domains = (
+            frozenset(self._normalize_domains(allowed_domains))
+            if allowed_domains
+            else None
+        )
+        self._blocked_domains = (
+            frozenset(self._normalize_domains(blocked_domains))
+            if blocked_domains
+            else frozenset()
+        )
+
+    @staticmethod
+    def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
+        """Fast domain normalization"""
+        if isinstance(domains, str):
+            return {domains.lower()}
+        return {d.lower() for d in domains}
+
+    @staticmethod
+    @lru_cache(maxsize=10000)
+    def _extract_domain(url: str) -> str:
+        """Ultra-fast domain extraction with regex and caching"""
+        match = FastDomainFilter._DOMAIN_REGEX.search(url)
+        return match.group(1).lower() if match else ""
+
+    def apply(self, url: str) -> bool:
+        """Optimized domain checking with early returns"""
+        # Skip processing if no filters
+        if not self._blocked_domains and self._allowed_domains is None:
+            self._update_stats(True)
+            return True
+
+        domain = self._extract_domain(url)
+
+        # Early return for blocked domains
+        if domain in self._blocked_domains:
+            self._update_stats(False)
+            return False
+
+        # If no allowed domains specified, accept all non-blocked
+        if self._allowed_domains is None:
+            self._update_stats(True)
+            return True
+
+        # Final allowed domains check
+        result = domain in self._allowed_domains
+        self._update_stats(result)
+        return result
+
+
+def create_fast_filter_chain() -> FastFilterChain:
+    """Create an optimized filter chain with filters ordered by rejection rate"""
+    return FastFilterChain(
+        [
+            # Domain filter first (fastest rejection)
+            FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]),
+            # Content filter second (medium speed)
+            FastContentTypeFilter(["text/html", "application/xhtml+xml"]),
+            # Pattern filter last (most expensive)
+            FastURLPatternFilter(
+                [
+                    "*.html",
+                    "*.htm",
+                    "*/article/*",
+                    "*/blog/*",
+                ]
+            ),
+        ]
+    )
+
+
+def run_performance_test():
+    import time
+    import random
+    from itertools import cycle
+
+    # Generate test URLs
+    base_urls = [
+        "https://example.com/article/123",
+        "https://blog.example.com/post/456",
+        "https://ads.example.com/tracking",
+        "https://example.com/about.html",
+        "https://analytics.example.com/script.js",
+        "https://example.com/products.php",
+        "https://subdomain.example.com/blog/post-123",
+        "https://example.com/path/file.pdf",
+    ]
+
+    # Create more varied test data
+    test_urls = []
+    for base in base_urls:
+        # Add original
+        test_urls.append(base)
+        # Add variations
+        parts = base.split("/")
+        for i in range(10):
+            parts[-1] = f"page_{i}.html"
+            test_urls.append("/".join(parts))
+
+    # Multiply to get enough test data
+    test_urls = test_urls * 10000  # Creates ~800k URLs
+
+    def benchmark(name: str, func, *args, warmup=True):
+        if warmup:
+            # Warmup run
+            func(*args)
+
+        # Actual timing
+        start = time.perf_counter_ns()
+        result = func(*args)
+        elapsed = (time.perf_counter_ns() - start) / 1_000_000  # Convert to ms
+        print(
+            f"{name:<30} {elapsed:>8.3f} ms  ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)"
+        )
+        return result
+
+    print("\nBenchmarking original vs optimized implementations...")
+    print("-" * 70)
+
+    # Original implementation
+    pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
+    content_filter = ContentTypeFilter(["text/html"])
+    domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
+    chain = FilterChain([pattern_filter, content_filter, domain_filter])
+
+    # Optimized implementation
+    fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
+    fast_content_filter = FastContentTypeFilter(["text/html"])
+    fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
+    fast_chain = FastFilterChain(
+        [fast_domain_filter, fast_content_filter, fast_pattern_filter]
+    )
+
+    # Test individual filters
+    print("\nSingle filter performance (first 1000 URLs):")
+    test_subset = test_urls[:1000]
+
+    print("\nPattern Filters:")
+    benchmark(
+        "Original Pattern Filter",
+        lambda: [pattern_filter.apply(url) for url in test_subset],
+    )
+    benchmark(
+        "Optimized Pattern Filter",
+        lambda: [fast_pattern_filter.apply(url) for url in test_subset],
+    )
+
+    print("\nContent Filters:")
+    benchmark(
+        "Original Content Filter",
+        lambda: [content_filter.apply(url) for url in test_subset],
+    )
+    benchmark(
+        "Optimized Content Filter",
+        lambda: [fast_content_filter.apply(url) for url in test_subset],
+    )
+
+    print("\nDomain Filters:")
+    benchmark(
+        "Original Domain Filter",
+        lambda: [domain_filter.apply(url) for url in test_subset],
+    )
+    benchmark(
+        "Optimized Domain Filter",
+        lambda: [fast_domain_filter.apply(url) for url in test_subset],
+    )
+
+    print("\nFull Chain Performance (all URLs):")
+    # Test chain
+    benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
+    benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])
+
+    # Memory usage
+    import sys
+
+    print("\nMemory Usage per Filter:")
+    print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
+    print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
+    print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
+    print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
+    print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
+    print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
+
+
+def test_pattern_filter():
+    import time
+    from itertools import chain
+
+    # Test cases as list of tuples instead of dict for multiple patterns
+    test_cases = [
+        # Simple suffix patterns (*.html)
+        ("*.html", {
+            "https://example.com/page.html": True,
+            "https://example.com/path/doc.html": True,
+            "https://example.com/page.htm": False,
+            "https://example.com/page.html?param=1": True,
+        }),
+        
+        # Path prefix patterns (/foo/*)
+        ("*/article/*", {
+            "https://example.com/article/123": True,
+            "https://example.com/blog/article/456": True,
+            "https://example.com/articles/789": False,
+            "https://example.com/article": False,
+        }),
+        
+        # Complex patterns
+        ("blog-*-[0-9]", {
+            "https://example.com/blog-post-1": True,
+            "https://example.com/blog-test-9": True,
+            "https://example.com/blog-post": False,
+            "https://example.com/blog-post-x": False,
+        }),
+        
+        # Multiple patterns case
+        (["*.pdf", "*/download/*"], {
+            "https://example.com/doc.pdf": True,
+            "https://example.com/download/file.txt": True,
+            "https://example.com/path/download/doc": True,
+            "https://example.com/uploads/file.txt": False,
+        }),
+        
+        # Edge cases
+        ("*", {
+            "https://example.com": True,
+            "": True,
+            "http://test.com/path": True,
+        }),
+        
+        # Complex regex
+        (r"^https?://.*\.example\.com/\d+", {
+            "https://sub.example.com/123": True,
+            "http://test.example.com/456": True,
+            "https://example.com/789": False,
+            "https://sub.example.com/abc": False,
+        })
+    ]
+
+    def run_accuracy_test():
+        print("\nAccuracy Tests:")
+        print("-" * 50)
+        
+        all_passed = True
+        for patterns, test_urls in test_cases:
+            filter_obj = FastURLPatternFilter(patterns)
+            
+            for url, expected in test_urls.items():
+                result = filter_obj.apply(url)
+                if result != expected:
+                    print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
+                    print(f"   Expected: {expected}, Got: {result}")
+                    all_passed = False
+                else:
+                    print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
+        
+        return all_passed
+
+    def run_speed_test():
+        print("\nSpeed Tests:")
+        print("-" * 50)
+        
+        # Create a large set of test URLs
+        all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases))
+        test_urls = all_urls * 10000  # 100K+ URLs
+        
+        # Test both implementations
+        original = URLPatternFilter(["*.html", "*/article/*", "blog-*"])
+        optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"])
+        
+        def benchmark(name, filter_obj):
+            start = time.perf_counter()
+            for url in test_urls:
+                filter_obj.apply(url)
+            elapsed = time.perf_counter() - start
+            urls_per_sec = len(test_urls) / elapsed
+            print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")
+        
+        benchmark("Original Filter:", original)
+        benchmark("Optimized Filter:", optimized)
+
+    # Run tests
+    print("Running Pattern Filter Tests...")
+    accuracy_passed = run_accuracy_test()
+    
+    if accuracy_passed:
+        print("\n✨ All accuracy tests passed!")
+        run_speed_test()
+    else:
+        print("\n❌ Some accuracy tests failed!")
+
+if __name__ == "__main__":
+    run_performance_test()
+    # test_pattern_filter()
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -8,6 +8,7 @@ from crawl4ai.scraper import (
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
+import time

 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)

@@ -55,6 +56,7 @@ async def basic_scraper_example():

 # advanced_scraper_example.py
 import logging
+
 from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
@@ -177,11 +179,15 @@ async def advanced_scraper_example():

 if __name__ == "__main__":
    import asyncio
+    import time

    # Run basic example
+    start_time = time.perf_counter()
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())
+    end_time = time.perf_counter()
+    print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")

-    # Run advanced example
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # # Run advanced example
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())
--- a/docs/scraper/scraper_quickstart_review.py
+++ b/docs/scraper/scraper_quickstart_review.py
@@ -0,0 +1,187 @@
+# basic_scraper_example.py
+from crawl4ai.scraper import (
+    AsyncWebScraper,
+    BFSScraperStrategy,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter,
+)
+from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
+import re
+
+browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
+
+
+async def basic_scraper_example():
+    """
+    Basic example: Scrape a blog site for articles
+    - Crawls only HTML pages
+    - Stays within the blog section
+    - Collects all results at once
+    """
+    # Create a simple filter chain
+    filter_chain = FilterChain(
+        [
+            # Only crawl pages within the blog section
+            URLPatternFilter("*/tutorial/*"),
+            # Only process HTML pages
+            ContentTypeFilter(["text/html"]),
+        ]
+    )
+
+    # Initialize the strategy with basic configuration
+    strategy = BFSScraperStrategy(
+        max_depth=2,  # Only go 2 levels deep
+        filter_chain=filter_chain,
+        url_scorer=None,  # Use default scoring
+        process_external_links=True,
+    )
+
+    # Create the crawler and scraper
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+        # Start scraping
+        try:
+            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
+
+            # Process results
+            print(f"Crawled {len(result.crawled_urls)} pages:")
+            for url, data in result.extracted_data.items():
+                print(f"- {url}: {len(data.html)} bytes")
+
+        except Exception as e:
+            print(f"Error during scraping: {e}")
+
+
+# advanced_scraper_example.py
+import logging
+from crawl4ai.scraper import (
+    AsyncWebScraper,
+    BFSScraperStrategy,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter,
+    DomainFilter,
+    KeywordRelevanceScorer,
+    PathDepthScorer,
+    FreshnessScorer,
+    CompositeScorer,
+)
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+
+async def advanced_scraper_example():
+    """
+    Advanced example: Intelligent news site scraping
+    - Uses all filter types
+    - Implements sophisticated scoring
+    - Streams results
+    - Includes monitoring and logging
+    """
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger("advanced_scraper")
+
+    # Create sophisticated filter chain
+    filter_chain = FilterChain(
+        [
+            # Domain control
+            DomainFilter(
+                allowed_domains=["techcrunch.com"],
+                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
+            ),
+            # URL patterns
+            URLPatternFilter(
+                [
+                    "*/article/*",
+                    "*/news/*",
+                    "*/blog/*",
+                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
+                ]
+            ),
+            # Content types
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
+        ]
+    )
+
+    # Create composite scorer
+    scorer = CompositeScorer(
+        [
+            # Prioritize by keywords
+            KeywordRelevanceScorer(
+                keywords=["news", "breaking", "update", "latest"], weight=1.0
+            ),
+            # Prefer optimal URL structure
+            PathDepthScorer(optimal_depth=3, weight=0.7),
+            # Prioritize fresh content
+            FreshnessScorer(weight=0.9),
+        ]
+    )
+
+    # Initialize strategy with advanced configuration
+    strategy = BFSScraperStrategy(
+        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
+    )
+
+    # Create crawler and scraper
+    async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+
+        # Track statistics
+        stats = {"processed": 0, "errors": 0, "total_size": 0}
+
+        try:
+            # Use streaming mode
+            result_generator = await scraper.ascrape(
+                "https://techcrunch.com", stream=True
+            )
+            async for result in result_generator:
+                stats["processed"] += 1
+
+                if result.success:
+                    stats["total_size"] += len(result.html)
+                    logger.info(f"Processed: {result.url}")
+                else:
+                    stats["errors"] += 1
+                    logger.error(
+                        f"Failed to process {result.url}: {result.error_message}"
+                    )
+
+                # Log progress regularly
+                if stats["processed"] % 10 == 0:
+                    logger.info(f"Progress: {stats['processed']} URLs processed")
+
+        except Exception as e:
+            logger.error(f"Scraping error: {e}")
+
+        finally:
+            # Print final statistics
+            logger.info("Scraping completed:")
+            logger.info(f"- URLs processed: {stats['processed']}")
+            logger.info(f"- Errors: {stats['errors']}")
+            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+
+            # Print filter statistics
+            for filter_ in filter_chain.filters:
+                logger.info(f"{filter_.name} stats:")
+                logger.info(f"- Passed: {filter_.stats.passed_urls}")
+                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+
+            # Print scorer statistics
+            logger.info("Scoring statistics:")
+            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+            logger.info(
+                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
+            )
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    # Run basic example
+    print("Running basic scraper example...")
+    asyncio.run(basic_scraper_example())
+
+    # Run advanced example
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())