feat(scraper): Enhance URL filtering and scoring systems

Implement comprehensive URL filtering and scoring capabilities. Filters: add URLPatternFilter with glob/regex support; implement ContentTypeFilter with MIME type checking; add DomainFilter for domain control; create FilterChain with stats tracking. Scorers: complete the KeywordRelevanceScorer implementation; add PathDepthScorer for URL structure scoring; implement ContentTypeScorer for file type priorities; add FreshnessScorer for date-based scoring; add DomainAuthorityScorer for domain weighting; create CompositeScorer for combined strategies. Features: add statistics tracking for both filters and scorers; implement logging support throughout; add resource cleanup methods; create comprehensive documentation; include performance optimizations. Tests and docs included. Note: review the URL normalization overlap with recent crawler changes. A Quick Start guide has also been created and added.
2024-11-08 18:45:12 +08:00
parent d11c004fbb
commit bae4665949
10 changed files with 1451 additions and 157 deletions
--- a/crawl4ai/scraper/bfs_scraper_strategy
+++ b/crawl4ai/scraper/bfs_scraper_strategy
@@ -1,138 +0,0 @@
-from .scraper_strategy import ScraperStrategy
-from .filters import FilterChain
-from .scorers import URLScorer
-from ..models import CrawlResult
-from ..async_webcrawler import AsyncWebCrawler
-import asyncio
-import validators
-from urllib.parse import urljoin,urlparse,urlunparse
-from urllib.robotparser import RobotFileParser
-import time
-from aiolimiter import AsyncLimiter
-from tenacity import retry, stop_after_attempt, wait_exponential
-from collections import defaultdict
-import logging
-from typing import Dict, AsyncGenerator
-logging.basicConfig(level=logging.DEBUG)
-
-rate_limiter = AsyncLimiter(1, 1)  # 1 request per second
-
-class BFSScraperStrategy(ScraperStrategy):
-    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
-        self.max_depth = max_depth
-        self.filter_chain = filter_chain
-        self.url_scorer = url_scorer
-        self.max_concurrent = max_concurrent
-        # For Crawl Politeness
-        self.last_crawl_time = defaultdict(float)
-        self.min_crawl_delay = min_crawl_delay  # 1 second delay between requests to the same domain
-        # For Robots.txt Compliance
-        self.robot_parsers = {}
-
-    # Robots.txt Parser
-    def get_robot_parser(self, url: str) -> RobotFileParser:
-        domain = urlparse(url)
-        scheme = domain.scheme if domain.scheme else 'http'  # Default to 'http' if no scheme provided
-        netloc = domain.netloc
-        if netloc not in self.robot_parsers:
-            rp = RobotFileParser()
-            rp.set_url(f"{scheme}://{netloc}/robots.txt")
-            try:
-                rp.read()
-            except Exception as e:
-                # Log the type of error, message, and the URL
-                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
-                return None
-            self.robot_parsers[netloc] = rp
-        return self.robot_parsers[netloc]
-    
-    # Retry with exponential backoff
-    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
-    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
-        return await crawler.arun(url)
-    
-    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
-        def normalize_url(url: str) -> str:
-            parsed = urlparse(url)
-            return urlunparse(parsed._replace(fragment=""))
-        
-        # URL Validation
-        if not validators.url(url):
-            logging.warning(f"Invalid URL: {url}")
-            return None
-        
-        # Robots.txt Compliance
-        robot_parser = self.get_robot_parser(url)
-        if robot_parser is None:
-            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
-        else:
-            # If robots.txt was fetched, check if crawling is allowed
-            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
-                logging.info(f"Skipping {url} as per robots.txt")
-                return None
-    
-        # Crawl Politeness
-        domain = urlparse(url).netloc
-        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
-        if time_since_last_crawl < self.min_crawl_delay:
-            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
-        self.last_crawl_time[domain] = time.time()
-
-        # Rate Limiting
-        async with rate_limiter:
-            # Error Handling
-            try:
-                crawl_result = await self.retry_crawl(crawler, url)
-            except Exception as e:
-                logging.error(f"Error crawling {url}: {str(e)}")
-                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))
-        
-        if not crawl_result.success:
-            # Logging and Monitoring
-            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
-            return crawl_result
-
-        # Process links
-        for link_type in ["internal", "external"]:
-            for link in crawl_result.links[link_type]:
-                absolute_link = urljoin(url, link['href'])
-                normalized_link = normalize_url(absolute_link)
-                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
-                    new_depth = depths[url] + 1
-                    if new_depth <= self.max_depth:
-                        # URL Scoring
-                        score = self.url_scorer.score(normalized_link)
-                        await queue.put((score, new_depth, normalized_link))
-                        depths[normalized_link] = new_depth
-        return crawl_result
-
-    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
-        queue = asyncio.PriorityQueue()
-        queue.put_nowait((0, 0, start_url))
-        visited = set()
-        depths = {start_url: 0}
-        pending_tasks = set()
-
-        while not queue.empty() or pending_tasks:
-            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
-                _, depth, url = await queue.get()
-                if url not in visited:
-                    # Adding URL to the visited set here itself, (instead of after result generation)
-                    # so that other tasks are not queued for same URL, found at different depth before
-                    # crawling and extraction of this task is completed.
-                    visited.add(url)
-                    if parallel_processing:
-                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
-                        pending_tasks.add(task)
-                    else:
-                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
-                        if result:
-                            yield result 
-
-            # Wait for the first task to complete and yield results incrementally as each task is completed
-            if pending_tasks:
-                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
-                for task in done:
-                    result = await task
-                    if result:
-                        yield result
--- a/crawl4ai/scraper/filters/init.py
+++ b/crawl4ai/scraper/filters/init.py
@@ -1,3 +1,205 @@
-from .url_filter import URLFilter, FilterChain
-from .content_type_filter import ContentTypeFilter
-from .url_pattern_filter import URLPatternFilter
+# from .url_filter import URLFilter, FilterChain
+# from .content_type_filter import ContentTypeFilter
+# from .url_pattern_filter import URLPatternFilter
+
+from abc import ABC, abstractmethod
+from typing import List, Pattern, Set, Union
+import re
+from urllib.parse import urlparse
+import mimetypes
+import logging
+from dataclasses import dataclass
+import fnmatch
+
+@dataclass
+class FilterStats:
+    """Statistics for filter applications"""
+    total_urls: int = 0
+    rejected_urls: int = 0
+    passed_urls: int = 0
+
+class URLFilter(ABC):
+    """Base class for URL filters"""
+    
+    def __init__(self, name: str = None):
+        self.name = name or self.__class__.__name__
+        self.stats = FilterStats()
+        self.logger = logging.getLogger(f"urlfilter.{self.name}")
+
+    @abstractmethod
+    def apply(self, url: str) -> bool:
+        """Apply the filter to a URL"""
+        pass
+
+    def _update_stats(self, passed: bool):
+        """Update filter statistics"""
+        self.stats.total_urls += 1
+        if passed:
+            self.stats.passed_urls += 1
+        else:
+            self.stats.rejected_urls += 1
+
+class FilterChain:
+    """Chain of URL filters."""
+    
+    def __init__(self, filters: List[URLFilter] = None):
+        self.filters = filters or []
+        self.stats = FilterStats()
+        self.logger = logging.getLogger("urlfilter.chain")
+
+    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
+        """Add a filter to the chain"""
+        self.filters.append(filter_)
+        return self  # Enable method chaining
+
+    def apply(self, url: str) -> bool:
+        """Apply all filters in the chain"""
+        self.stats.total_urls += 1
+        
+        for filter_ in self.filters:
+            if not filter_.apply(url):
+                self.stats.rejected_urls += 1
+                self.logger.debug(f"URL {url} rejected by {filter_.name}")
+                return False
+        
+        self.stats.passed_urls += 1
+        return True
+
+class URLPatternFilter(URLFilter):
+    """Filter URLs based on glob patterns or regex.
+    
+    pattern_filter = URLPatternFilter([
+        "*.example.com/*",  # Glob pattern
+        "*/article/*",      # Path pattern
+        re.compile(r"blog-\d+") # Regex pattern
+    ])
+
+    - Supports glob patterns and regex
+    - Multiple patterns per filter
+    - Pattern pre-compilation for performance    
+    """
+    
+    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], 
+                 use_glob: bool = True):
+        super().__init__()
+        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
+        self.use_glob = use_glob
+        self._compiled_patterns = []
+        
+        for pattern in self.patterns:
+            if isinstance(pattern, str) and use_glob:
+                self._compiled_patterns.append(self._glob_to_regex(pattern))
+            else:
+                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)
+
+    def _glob_to_regex(self, pattern: str) -> Pattern:
+        """Convert glob pattern to regex"""
+        return re.compile(fnmatch.translate(pattern))
+
+    def apply(self, url: str) -> bool:
+        """Check if URL matches any of the patterns"""
+        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
+        self._update_stats(matches)
+        return matches
+
+class ContentTypeFilter(URLFilter):
+    """Filter URLs based on expected content type.
+    
+    content_filter = ContentTypeFilter([
+        "text/html",
+        "application/pdf"
+    ], check_extension=True)
+
+    - Filter by MIME types
+    - Extension checking
+    - Support for multiple content types
+    """
+    
+    def __init__(self, allowed_types: Union[str, List[str]], 
+                 check_extension: bool = True):
+        super().__init__()
+        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        self.check_extension = check_extension
+        self._normalize_types()
+
+    def _normalize_types(self):
+        """Normalize content type strings"""
+        self.allowed_types = [t.lower() for t in self.allowed_types]
+
+    def _check_extension(self, url: str) -> bool:
+        """Check URL's file extension"""
+        ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else ''
+        if not ext:
+            return True  # No extension, might be dynamic content
+            
+        guessed_type = mimetypes.guess_type(url)[0]
+        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)
+
+    def apply(self, url: str) -> bool:
+        """Check if URL's content type is allowed"""
+        result = True
+        if self.check_extension:
+            result = self._check_extension(url)
+        self._update_stats(result)
+        return result
+
+class DomainFilter(URLFilter):
+    """Filter URLs based on allowed/blocked domains.
+    
+    domain_filter = DomainFilter(
+        allowed_domains=["example.com", "blog.example.com"],
+        blocked_domains=["ads.example.com"]
+    )
+
+    - Allow/block specific domains
+    - Subdomain support
+    - Efficient domain matching
+    """
+    
+    def __init__(self, allowed_domains: Union[str, List[str]] = None, 
+                 blocked_domains: Union[str, List[str]] = None):
+        super().__init__()
+        self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None
+        self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
+
+    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
+        """Normalize domain strings"""
+        if isinstance(domains, str):
+            domains = [domains]
+        return [d.lower().strip() for d in domains]
+
+    def _extract_domain(self, url: str) -> str:
+        """Extract domain from URL"""
+        return urlparse(url).netloc.lower()
+
+    def apply(self, url: str) -> bool:
+        """Check if URL's domain is allowed"""
+        domain = self._extract_domain(url)
+        
+        if domain in self.blocked_domains:
+            self._update_stats(False)
+            return False
+            
+        if self.allowed_domains is not None and domain not in self.allowed_domains:
+            self._update_stats(False)
+            return False
+            
+        self._update_stats(True)
+        return True
+
+# Example usage:
+def create_common_filter_chain() -> FilterChain:
+    """Create a commonly used filter chain"""
+    return FilterChain([
+        URLPatternFilter([
+            "*.html", "*.htm",  # HTML files
+            "*/article/*", "*/blog/*"  # Common content paths
+        ]),
+        ContentTypeFilter([
+            "text/html",
+            "application/xhtml+xml"
+        ]),
+        DomainFilter(
+            blocked_domains=["ads.*", "analytics.*"]
+        )
+    ])
--- a/crawl4ai/scraper/filters/content_type_filter.py
+++ b/crawl4ai/scraper/filters/content_type_filter.py
@@ -1,8 +1,43 @@
 from .url_filter import URLFilter
+from typing import List, Union
+from urllib.parse import urlparse
+import mimetypes
+

 class ContentTypeFilter(URLFilter):
-    def __init__(self, contentType: str):
-        self.contentType = contentType
+    """Filter URLs based on expected content type"""
+    
+    def __init__(self, allowed_types: Union[str, List[str]], 
+                 check_extension: bool = True):
+        super().__init__()
+        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        self.check_extension = check_extension
+        self._normalize_types()
+
+    def _normalize_types(self):
+        """Normalize content type strings"""
+        self.allowed_types = [t.lower() for t in self.allowed_types]
+
+    def _check_extension(self, url: str) -> bool:
+        """Check URL's file extension"""
+        ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else ''
+        if not ext:
+            return True  # No extension, might be dynamic content
+            
+        guessed_type = mimetypes.guess_type(url)[0]
+        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)
+
    def apply(self, url: str) -> bool:
-        #TODO: This is a stub. Will implement this later
-        return True
+        """Check if URL's content type is allowed"""
+        result = True
+        if self.check_extension:
+            result = self._check_extension(url)
+        self._update_stats(result)
+        return result
+
+# class ContentTypeFilter(URLFilter):
+#     def __init__(self, contentType: str):
+#         self.contentType = contentType
+#     def apply(self, url: str) -> bool:
+#         #TODO: This is a stub. Will implement this later
+#         return True
--- a/crawl4ai/scraper/filters/url_filter.py
+++ b/crawl4ai/scraper/filters/url_filter.py
@@ -1,16 +1,72 @@
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+import logging
+from typing import List
+@dataclass
+class FilterStats:
+    """Statistics for filter applications"""
+    total_urls: int = 0
+    rejected_urls: int = 0
+    passed_urls: int = 0

 class URLFilter(ABC):
+    """Base class for URL filters"""
+    
+    def __init__(self, name: str = None):
+        self.name = name or self.__class__.__name__
+        self.stats = FilterStats()
+        self.logger = logging.getLogger(f"urlfilter.{self.name}")
+
    @abstractmethod
    def apply(self, url: str) -> bool:
+        """Apply the filter to a URL"""
        pass

-class FilterChain:
-    def __init__(self):
-        self.filters = []
+    def _update_stats(self, passed: bool):
+        """Update filter statistics"""
+        self.stats.total_urls += 1
+        if passed:
+            self.stats.passed_urls += 1
+        else:
+            self.stats.rejected_urls += 1

-    def add_filter(self, filter: URLFilter):
-        self.filters.append(filter)
+class FilterChain:
+    """Chain of URL filters"""
+    
+    def __init__(self, filters: List[URLFilter] = None):
+        self.filters = filters or []
+        self.stats = FilterStats()
+        self.logger = logging.getLogger("urlfilter.chain")
+
+    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
+        """Add a filter to the chain"""
+        self.filters.append(filter_)
+        return self  # Enable method chaining

    def apply(self, url: str) -> bool:
-        return all(filter.apply(url) for filter in self.filters)
+        """Apply all filters in the chain"""
+        self.stats.total_urls += 1
+        
+        for filter_ in self.filters:
+            if not filter_.apply(url):
+                self.stats.rejected_urls += 1
+                self.logger.debug(f"URL {url} rejected by {filter_.name}")
+                return False
+        
+        self.stats.passed_urls += 1
+        return True
+    
+# class URLFilter(ABC):
+#     @abstractmethod
+#     def apply(self, url: str) -> bool:
+#         pass
+
+# class FilterChain:
+#     def __init__(self):
+#         self.filters = []
+
+#     def add_filter(self, filter: URLFilter):
+#         self.filters.append(filter)
+
+#     def apply(self, url: str) -> bool:
+#         return all(filter.apply(url) for filter in self.filters)
--- a/crawl4ai/scraper/filters/url_pattern_filter.py
+++ b/crawl4ai/scraper/filters/url_pattern_filter.py
@@ -1,9 +1,39 @@
 from .url_filter import URLFilter
 from re import Pattern
+from typing import List, Union
+import re
+import fnmatch
+

 class URLPatternFilter(URLFilter):
-    def __init__(self, pattern: Pattern):
-        self.pattern = pattern
+    """Filter URLs based on glob patterns or regex"""
+    
+    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], 
+                 use_glob: bool = True):
+        super().__init__()
+        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
+        self.use_glob = use_glob
+        self._compiled_patterns = []
+        
+        for pattern in self.patterns:
+            if isinstance(pattern, str) and use_glob:
+                self._compiled_patterns.append(self._glob_to_regex(pattern))
+            else:
+                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)
+
+    def _glob_to_regex(self, pattern: str) -> Pattern:
+        """Convert glob pattern to regex"""
+        return re.compile(fnmatch.translate(pattern))
+
    def apply(self, url: str) -> bool:
-        #TODO: This is a stub. Will implement this later.
-        return True
+        """Check if URL matches any of the patterns"""
+        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
+        self._update_stats(matches)
+        return matches
+
+# class URLPatternFilter(URLFilter):
+#     def __init__(self, pattern: Pattern):
+#         self.pattern = pattern
+#     def apply(self, url: str) -> bool:
+#         #TODO: This is a stub. Will implement this later.
+#         return True
--- a/crawl4ai/scraper/scorers/init.py
+++ b/crawl4ai/scraper/scorers/init.py
@@ -1,2 +1,268 @@
-from .url_scorer import URLScorer
-from .keyword_relevance_scorer import KeywordRelevanceScorer
+# from .url_scorer import URLScorer
+# from .keyword_relevance_scorer import KeywordRelevanceScorer
+
+from abc import ABC, abstractmethod
+from typing import List, Dict, Optional, Union
+from dataclasses import dataclass
+from urllib.parse import urlparse, unquote
+import re
+from collections import defaultdict
+import math
+import logging
+
+@dataclass
+class ScoringStats:
+    """Statistics for URL scoring"""
+    urls_scored: int = 0
+    total_score: float = 0.0
+    min_score: float = float('inf')
+    max_score: float = float('-inf')
+    
+    def update(self, score: float):
+        """Update scoring statistics"""
+        self.urls_scored += 1
+        self.total_score += score
+        self.min_score = min(self.min_score, score)
+        self.max_score = max(self.max_score, score)
+    
+    @property
+    def average_score(self) -> float:
+        """Calculate average score"""
+        return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0
+
+class URLScorer(ABC):
+    """Base class for URL scoring strategies"""
+    
+    def __init__(self, weight: float = 1.0, name: str = None):
+        self.weight = weight
+        self.name = name or self.__class__.__name__
+        self.stats = ScoringStats()
+        self.logger = logging.getLogger(f"urlscorer.{self.name}")
+
+    @abstractmethod
+    def _calculate_score(self, url: str) -> float:
+        """Calculate the raw score for a URL"""
+        pass
+
+    def score(self, url: str) -> float:
+        """Calculate the weighted score for a URL"""
+        raw_score = self._calculate_score(url)
+        weighted_score = raw_score * self.weight
+        self.stats.update(weighted_score)
+        return weighted_score
+
+class CompositeScorer(URLScorer):
+    """Combines multiple scorers with weights"""
+    
+    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
+        super().__init__(name="CompositeScorer")
+        self.scorers = scorers
+        self.normalize = normalize
+
+    def _calculate_score(self, url: str) -> float:
+        scores = [scorer.score(url) for scorer in self.scorers]
+        total_score = sum(scores)
+        
+        if self.normalize and scores:
+            total_score /= len(scores)
+            
+        return total_score
+
+class KeywordRelevanceScorer(URLScorer):
+    """Score URLs based on keyword relevance.
+
+    keyword_scorer = KeywordRelevanceScorer(
+        keywords=["python", "programming"],
+        weight=1.0,
+        case_sensitive=False
+    )
+
+    - Score based on keyword matches
+    - Case sensitivity options
+    - Weighted scoring
+    """
+    
+    def __init__(self, keywords: List[str], weight: float = 1.0,
+                 case_sensitive: bool = False):
+        super().__init__(weight=weight)
+        self.keywords = keywords
+        self.case_sensitive = case_sensitive
+        self._compile_keywords()
+
+    def _compile_keywords(self):
+        """Prepare keywords for matching"""
+        flags = 0 if self.case_sensitive else re.IGNORECASE
+        self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords]
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate score based on keyword matches"""
+        decoded_url = unquote(url)
+        total_matches = sum(
+            1 for pattern in self.patterns
+            if pattern.search(decoded_url)
+        )
+        # Normalize score between 0 and 1
+        return total_matches / len(self.patterns) if self.patterns else 0.0
+
+class PathDepthScorer(URLScorer):
+    """Score URLs based on their path depth.
+        
+    path_scorer = PathDepthScorer(
+        optimal_depth=3,  # Preferred URL depth
+        weight=0.7
+    )
+
+    - Score based on URL path depth
+    - Configurable optimal depth
+    - Diminishing returns for deeper paths
+    """
+    
+    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
+        super().__init__(weight=weight)
+        self.optimal_depth = optimal_depth
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate score based on path depth"""
+        path = urlparse(url).path
+        depth = len([x for x in path.split('/') if x])
+        
+        # Score decreases as we move away from optimal depth
+        distance_from_optimal = abs(depth - self.optimal_depth)
+        return 1.0 / (1.0 + distance_from_optimal)
+
+class ContentTypeScorer(URLScorer):
+    """Score URLs based on content type preferences.
+    
+    content_scorer = ContentTypeScorer({
+        r'\.html$': 1.0,
+        r'\.pdf$': 0.8,
+        r'\.xml$': 0.6
+    })
+
+    - Score based on file types
+    - Configurable type weights
+    - Pattern matching support
+    """
+    
+    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
+        super().__init__(weight=weight)
+        self.type_weights = type_weights
+        self._compile_patterns()
+
+    def _compile_patterns(self):
+        """Prepare content type patterns"""
+        self.patterns = {
+            re.compile(pattern): weight
+            for pattern, weight in self.type_weights.items()
+        }
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate score based on content type matching"""
+        for pattern, weight in self.patterns.items():
+            if pattern.search(url):
+                return weight
+        return 0.0
+
+class FreshnessScorer(URLScorer):
+    """Score URLs based on freshness indicators.
+    
+    freshness_scorer = FreshnessScorer(weight=0.9)
+
+    Score based on date indicators in URLs
+    Multiple date format support
+    Recency weighting"""
+    
+    def __init__(self, weight: float = 1.0):
+        super().__init__(weight=weight)
+        self.date_patterns = [
+            r'/(\d{4})/(\d{2})/(\d{2})/',  # yyyy/mm/dd
+            r'(\d{4})[-_](\d{2})[-_](\d{2})',  # yyyy-mm-dd
+            r'/(\d{4})/',  # year only
+        ]
+        self._compile_patterns()
+
+    def _compile_patterns(self):
+        """Prepare date patterns"""
+        self.compiled_patterns = [re.compile(p) for p in self.date_patterns]
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate score based on date indicators"""
+        for pattern in self.compiled_patterns:
+            if match := pattern.search(url):
+                year = int(match.group(1))
+                # Score higher for more recent years
+                return 1.0 - (2024 - year) * 0.1
+        return 0.5  # Default score for URLs without dates
+
+class DomainAuthorityScorer(URLScorer):
+    """Score URLs based on domain authority.
+
+    authority_scorer = DomainAuthorityScorer({
+        "python.org": 1.0,
+        "github.com": 0.9,
+        "medium.com": 0.7
+    })
+
+    Score based on domain importance
+    Configurable domain weights
+    Default weight for unknown domains"""
+    
+    def __init__(self, domain_weights: Dict[str, float], 
+                 default_weight: float = 0.5, weight: float = 1.0):
+        super().__init__(weight=weight)
+        self.domain_weights = domain_weights
+        self.default_weight = default_weight
+
+    def _calculate_score(self, url: str) -> float:
+        """Calculate score based on domain authority"""
+        domain = urlparse(url).netloc.lower()
+        return self.domain_weights.get(domain, self.default_weight)
+
+def create_balanced_scorer() -> CompositeScorer:
+    """Create a balanced composite scorer"""
+    return CompositeScorer([
+        KeywordRelevanceScorer(
+            keywords=["article", "blog", "news", "research"],
+            weight=1.0
+        ),
+        PathDepthScorer(
+            optimal_depth=3,
+            weight=0.7
+        ),
+        ContentTypeScorer(
+            type_weights={
+                r'\.html?$': 1.0,
+                r'\.pdf$': 0.8,
+                r'\.xml$': 0.6
+            },
+            weight=0.8
+        ),
+        FreshnessScorer(
+            weight=0.9
+        )
+    ])
+
+# Example Usage:
+"""
+# Create a composite scorer
+scorer = CompositeScorer([
+    KeywordRelevanceScorer(["python", "programming"], weight=1.0),
+    PathDepthScorer(optimal_depth=2, weight=0.7),
+    FreshnessScorer(weight=0.8),
+    DomainAuthorityScorer(
+        domain_weights={
+            "python.org": 1.0,
+            "github.com": 0.9,
+            "medium.com": 0.7
+        },
+        weight=0.9
+    )
+])
+
+# Score a URL
+score = scorer.score("https://python.org/article/2024/01/new-features")
+
+# Access statistics
+print(f"Average score: {scorer.stats.average_score}")
+print(f"URLs scored: {scorer.stats.urls_scored}")
+"""
--- a/docs/scrapper/filters_scrorers.md
+++ b/docs/scrapper/filters_scrorers.md
@@ -0,0 +1,342 @@
+# URL Filters and Scorers
+
+The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively.
+
+```mermaid
+flowchart TB
+    Start([URL Input]) --> Chain[Filter Chain]
+    
+    subgraph Chain Process
+        Chain --> Pattern{URL Pattern\nFilter}
+        Pattern -->|Match| Content{Content Type\nFilter}
+        Pattern -->|No Match| Reject1[Reject URL]
+        
+        Content -->|Allowed| Domain{Domain\nFilter}
+        Content -->|Not Allowed| Reject2[Reject URL]
+        
+        Domain -->|Allowed| Accept[Accept URL]
+        Domain -->|Blocked| Reject3[Reject URL]
+    end
+    
+    subgraph Statistics
+        Pattern --> UpdatePattern[Update Pattern Stats]
+        Content --> UpdateContent[Update Content Stats]
+        Domain --> UpdateDomain[Update Domain Stats]
+        Accept --> UpdateChain[Update Chain Stats]
+        Reject1 --> UpdateChain
+        Reject2 --> UpdateChain
+        Reject3 --> UpdateChain
+    end
+    
+    Accept --> End([End])
+    Reject1 --> End
+    Reject2 --> End
+    Reject3 --> End
+    
+    classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
+    classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
+    classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px;
+    classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px;
+    
+    class Start,End accept;
+    class Pattern,Content,Domain decision;
+    class Reject1,Reject2,Reject3 reject;
+    class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process;
+```
+
+## URL Filters
+
+URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules.
+
+### Available Filters
+
+1. **URL Pattern Filter**
+```python
+pattern_filter = URLPatternFilter([
+    "*.example.com/*",  # Glob pattern
+    "*/article/*",      # Path pattern
+    re.compile(r"blog-\d+") # Regex pattern
+])
+```
+- Supports glob patterns and regex
+- Multiple patterns per filter
+- Pattern pre-compilation for performance
+
+2. **Content Type Filter**
+```python
+content_filter = ContentTypeFilter([
+    "text/html",
+    "application/pdf"
+], check_extension=True)
+```
+- Filter by MIME types
+- Extension checking
+- Support for multiple content types
+
+3. **Domain Filter**
+```python
+domain_filter = DomainFilter(
+    allowed_domains=["example.com", "blog.example.com"],
+    blocked_domains=["ads.example.com"]
+)
+```
+- Allow/block specific domains
+- Subdomain support
+- Efficient domain matching
+
+### Creating Filter Chains
+
+```python
+# Create and configure a filter chain
+filter_chain = FilterChain([
+    URLPatternFilter(["*.example.com/*"]),
+    ContentTypeFilter(["text/html"]),
+    DomainFilter(blocked_domains=["ads.*"])
+])
+
+# Add more filters
+filter_chain.add_filter(
+    URLPatternFilter(["*/article/*"])
+)
+```
+
+```mermaid
+flowchart TB
+    Start([URL Input]) --> Composite[Composite Scorer]
+    
+    subgraph Scoring Process
+        Composite --> Keywords[Keyword Relevance]
+        Composite --> Path[Path Depth]
+        Composite --> Content[Content Type]
+        Composite --> Fresh[Freshness]
+        Composite --> Domain[Domain Authority]
+        
+        Keywords --> KeywordScore[Calculate Score]
+        Path --> PathScore[Calculate Score]
+        Content --> ContentScore[Calculate Score]
+        Fresh --> FreshScore[Calculate Score]
+        Domain --> DomainScore[Calculate Score]
+        
+        KeywordScore --> Weight1[Apply Weight]
+        PathScore --> Weight2[Apply Weight]
+        ContentScore --> Weight3[Apply Weight]
+        FreshScore --> Weight4[Apply Weight]
+        DomainScore --> Weight5[Apply Weight]
+    end
+    
+    Weight1 --> Combine[Combine Scores]
+    Weight2 --> Combine
+    Weight3 --> Combine
+    Weight4 --> Combine
+    Weight5 --> Combine
+    
+    Combine --> Normalize{Normalize?}
+    Normalize -->|Yes| NormalizeScore[Normalize Combined Score]
+    Normalize -->|No| FinalScore[Final Score]
+    NormalizeScore --> FinalScore
+    
+    FinalScore --> Stats[Update Statistics]
+    Stats --> End([End])
+    
+    classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
+    classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px;
+    classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px;
+    classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px;
+    
+    class Start,End calc;
+    class Keywords,Path,Content,Fresh,Domain scorer;
+    class KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process;
+    class Normalize decision;
+```
+
+## URL Scorers
+
+URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority.
+
+### Available Scorers
+
+1. **Keyword Relevance Scorer**
+```python
+keyword_scorer = KeywordRelevanceScorer(
+    keywords=["python", "programming"],
+    weight=1.0,
+    case_sensitive=False
+)
+```
+- Score based on keyword matches
+- Case sensitivity options
+- Weighted scoring
+
+2. **Path Depth Scorer**
+```python
+path_scorer = PathDepthScorer(
+    optimal_depth=3,  # Preferred URL depth
+    weight=0.7
+)
+```
+- Score based on URL path depth
+- Configurable optimal depth
+- Diminishing returns for deeper paths
+
+3. **Content Type Scorer**
+```python
+content_scorer = ContentTypeScorer({
+    r'\.html$': 1.0,
+    r'\.pdf$': 0.8,
+    r'\.xml$': 0.6
+})
+```
+- Score based on file types
+- Configurable type weights
+- Pattern matching support
+
+4. **Freshness Scorer**
+```python
+freshness_scorer = FreshnessScorer(weight=0.9)
+```
+- Score based on date indicators in URLs
+- Multiple date format support
+- Recency weighting
+
+5. **Domain Authority Scorer**
+```python
+authority_scorer = DomainAuthorityScorer({
+    "python.org": 1.0,
+    "github.com": 0.9,
+    "medium.com": 0.7
+})
+```
+- Score based on domain importance
+- Configurable domain weights
+- Default weight for unknown domains
+
+### Combining Scorers
+
+```python
+# Create a composite scorer
+composite_scorer = CompositeScorer([
+    KeywordRelevanceScorer(["python"], weight=1.0),
+    PathDepthScorer(optimal_depth=2, weight=0.7),
+    FreshnessScorer(weight=0.8)
+], normalize=True)
+```
+
+## Best Practices
+
+### Filter Configuration
+
+1. **Start Restrictive**
+   ```python
+   # Begin with strict filters
+   filter_chain = FilterChain([
+       DomainFilter(allowed_domains=["example.com"]),
+       ContentTypeFilter(["text/html"])
+   ])
+   ```
+
+2. **Layer Filters**
+   ```python
+   # Add more specific filters
+   filter_chain.add_filter(
+       URLPatternFilter(["*/article/*", "*/blog/*"])
+   )
+   ```
+
+3. **Monitor Filter Statistics**
+   ```python
+   # Check filter performance
+   for filter_ in filter_chain.filters:
+       print(f"{filter_.name}: {filter_.stats.rejected_urls} rejected")
+   ```
+
+### Scorer Configuration
+
+1. **Balance Weights**
+   ```python
+   # Balanced scoring configuration
+   scorer = create_balanced_scorer()
+   ```
+
+2. **Customize for Content**
+   ```python
+   # News site configuration
+   news_scorer = CompositeScorer([
+       KeywordRelevanceScorer(["news", "article"], weight=1.0),
+       FreshnessScorer(weight=1.0),
+       PathDepthScorer(optimal_depth=2, weight=0.5)
+   ])
+   ```
+
+3. **Monitor Scoring Statistics**
+   ```python
+   # Check scoring distribution
+   print(f"Average score: {scorer.stats.average_score}")
+   print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}")
+   ```
+
+## Common Use Cases
+
+### Blog Crawling
+```python
+blog_config = {
+    'filters': FilterChain([
+        URLPatternFilter(["*/blog/*", "*/post/*"]),
+        ContentTypeFilter(["text/html"])
+    ]),
+    'scorer': CompositeScorer([
+        FreshnessScorer(weight=1.0),
+        KeywordRelevanceScorer(["blog", "article"], weight=0.8)
+    ])
+}
+```
+
+### Documentation Sites
+```python
+docs_config = {
+    'filters': FilterChain([
+        URLPatternFilter(["*/docs/*", "*/guide/*"]),
+        ContentTypeFilter(["text/html", "application/pdf"])
+    ]),
+    'scorer': CompositeScorer([
+        PathDepthScorer(optimal_depth=3, weight=1.0),
+        KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9)
+    ])
+}
+```
+
+### E-commerce Sites
+```python
+ecommerce_config = {
+    'filters': FilterChain([
+        URLPatternFilter(["*/product/*", "*/category/*"]),
+        DomainFilter(blocked_domains=["ads.*", "tracker.*"])
+    ]),
+    'scorer': CompositeScorer([
+        PathDepthScorer(optimal_depth=2, weight=1.0),
+        ContentTypeScorer({
+            r'/product/': 1.0,
+            r'/category/': 0.8
+        })
+    ])
+}
+```
+
+## Advanced Topics
+
+### Custom Filters
+```python
+class CustomFilter(URLFilter):
+    def apply(self, url: str) -> bool:
+        # Your custom filtering logic
+        return True
+```
+
+### Custom Scorers
+```python
+class CustomScorer(URLScorer):
+    def _calculate_score(self, url: str) -> float:
+        # Your custom scoring logic
+        return 1.0
+```
+
+For more examples, check our [example repository](https://github.com/example/crawl4ai/examples).
--- a/docs/scrapper/how_to_use.md
+++ b/docs/scrapper/how_to_use.md
@@ -0,0 +1,206 @@
+# Scraper Examples Guide
+
+This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features.
+
+## Basic Example
+
+The basic example demonstrates a simple blog scraping scenario:
+
+```python
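+from crawl4ai.scraper import (
+    AsyncWebScraper,
+    BFSScraperStrategy,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter
+)
+from crawl4ai.async_webcrawler import AsyncWebCrawler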
+
+# Create simple filter chain
+filter_chain = FilterChain([
+    URLPatternFilter("*/blog/*"),
+    ContentTypeFilter(["text/html"])
+])
+
+# Initialize strategy
+strategy = BFSScraperStrategy(
+    max_depth=2,
+    filter_chain=filter_chain,
+    url_scorer=None,
+    max_concurrent=3
+)
+
+# Create and run scraper
+crawler = AsyncWebCrawler()
+scraper = AsyncWebScraper(crawler, strategy)
+result = await scraper.ascrape("https://example.com/blog/")
+```
+
+### Features Demonstrated
+- Basic URL filtering
+- Simple content type filtering
+- Depth control
+- Concurrent request limiting
+- Result collection
+
+## Advanced Example
+
+The advanced example shows a sophisticated news site scraping setup with all features enabled:
+
+```python
+# Create comprehensive filter chain
+filter_chain = FilterChain([
+    DomainFilter(
+        allowed_domains=["example.com"],
+        blocked_domains=["ads.example.com"]
+    ),
+    URLPatternFilter([
+        "*/article/*",
+        re.compile(r"\d{4}/\d{2}/.*")
+    ]),
+    ContentTypeFilter(["text/html"])
+])
+
+# Create intelligent scorer
+scorer = CompositeScorer([
+    KeywordRelevanceScorer(
+        keywords=["news", "breaking"],
+        weight=1.0
+    ),
+    PathDepthScorer(optimal_depth=3, weight=0.7),
+    FreshnessScorer(weight=0.9)
+])
+
+# Initialize advanced strategy
+strategy = BFSScraperStrategy(
+    max_depth=4,
+    filter_chain=filter_chain,
+    url_scorer=scorer,
+    max_concurrent=5
+)
+```
+
+### Features Demonstrated
+1. **Advanced Filtering**
+   - Domain filtering
+   - Pattern matching
+   - Content type control
+
+2. **Intelligent Scoring**
+   - Keyword relevance
+   - Path optimization
+   - Freshness priority
+
+3. **Monitoring**
+   - Progress tracking
+   - Error handling
+   - Statistics collection
+
+4. **Resource Management**
+   - Concurrent processing
+   - Rate limiting
+   - Cleanup handling
+
+## Running the Examples
+
+```bash
+# Basic usage
+python basic_scraper_example.py
+
+# Advanced usage with logging
+PYTHONPATH=. python advanced_scraper_example.py
+```
+
+## Example Output
+
+### Basic Example
+```
+Crawled 15 pages:
+- https://example.com/blog/post1: 24560 bytes
+- https://example.com/blog/post2: 18920 bytes
+...
+```
+
+### Advanced Example
+```
+INFO: Starting crawl of https://example.com/news/
+INFO: Processed: https://example.com/news/breaking/story1
+DEBUG: KeywordScorer: 0.85
+DEBUG: FreshnessScorer: 0.95
+INFO: Progress: 10 URLs processed
+...
+INFO: Scraping completed:
+INFO: - URLs processed: 50
+INFO: - Errors: 2
+INFO: - Total content size: 1240.50 KB
+```
+
+## Customization
+
+### Adding Custom Filters
+```python
+class CustomFilter(URLFilter):
+    def apply(self, url: str) -> bool:
+        # Your custom filtering logic
+        return True
+
+filter_chain.add_filter(CustomFilter())
+```
+
+### Custom Scoring Logic
+```python
+class CustomScorer(URLScorer):
+    def _calculate_score(self, url: str) -> float:
+        # Your custom scoring logic
+        return 1.0
+
+scorer = CompositeScorer([
+    CustomScorer(weight=1.0),
+    ...
+])
+```
+
+## Best Practices
+
+1. **Start Simple**
+   - Begin with basic filtering
+   - Add features incrementally
+   - Test thoroughly at each step
+
+2. **Monitor Performance**
+   - Watch memory usage
+   - Track processing times
+   - Adjust concurrency as needed
+
+3. **Handle Errors**
+   - Implement proper error handling
+   - Log important events
+   - Track error statistics
+
+4. **Optimize Resources**
+   - Set appropriate delays
+   - Limit concurrent requests
+   - Use streaming for large crawls
+
+## Troubleshooting
+
+Common issues and solutions:
+
+1. **Too Many Requests**
+   ```python
+   strategy = BFSScraperStrategy(
+       max_depth=2,
+       filter_chain=filter_chain,
+       url_scorer=scorer,
+       max_concurrent=3,  # Reduce concurrent requests
+       min_crawl_delay=2  # Increase delay between requests
+   )
+   ```
+
+2. **Memory Issues**
+   ```python
+   # Use streaming mode for large crawls
+   async for result in scraper.ascrape(url, stream=True):
+       process_result(result)
+   ```
+
+3. **Missing Content**
+   ```python
+   # Check your filter chain
+   filter_chain = FilterChain([
+       URLPatternFilter("*"),  # Broaden patterns
+       ContentTypeFilter(["*"])  # Accept all content
+   ])
+   ```
+
+For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples).
--- a/docs/scrapper/web_crawler_quick_start.py
+++ b/docs/scrapper/web_crawler_quick_start.py
@@ -0,0 +1,111 @@
+import unittest, os
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
+from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
+
+class TestWebCrawler(unittest.TestCase):
+    
+    def setUp(self):
+        self.crawler = WebCrawler()
+    
+    def test_warmup(self):
+        self.crawler.warmup()
+        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")
+    
+    def test_run_default_strategies(self):
+        result = self.crawler.run(
+            url='https://www.nbcnews.com/business',
+            word_count_threshold=5,
+            chunking_strategy=RegexChunking(),
+            extraction_strategy=CosineStrategy(), bypass_cache=True
+        )
+        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")
+    
+    def test_run_different_strategies(self):
+        url = 'https://www.nbcnews.com/business'
+        
+        # Test with FixedLengthWordChunking and LLMExtractionStrategy
+        result = self.crawler.run(
+            url=url,
+            word_count_threshold=5,
+            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
+            extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
+        )
+        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")
+        
+        # Test with SlidingWindowChunking and TopicExtractionStrategy
+        result = self.crawler.run(
+            url=url,
+            word_count_threshold=5,
+            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
+            extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
+        )
+        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")
+    
+    def test_invalid_url(self):
+        with self.assertRaises(Exception) as context:
+            self.crawler.run(url='invalid_url', bypass_cache=True)
+        self.assertIn("Invalid URL", str(context.exception))
+    
+    def test_unsupported_extraction_strategy(self):
+        with self.assertRaises(Exception) as context:
+            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
+        self.assertIn("Unsupported extraction strategy", str(context.exception))
+    
+    def test_invalid_css_selector(self):
+        with self.assertRaises(ValueError) as context:
+            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
+        self.assertIn("Invalid CSS selector", str(context.exception))
+
+    
+    def test_crawl_with_cache_and_bypass_cache(self):
+        url = 'https://www.nbcnews.com/business'
+        
+        # First crawl with cache enabled
+        result = self.crawler.run(url=url, bypass_cache=False)
+        self.assertTrue(result.success, "Failed to crawl and cache the result")
+        
+        # Second crawl with bypass_cache=True
+        result = self.crawler.run(url=url, bypass_cache=True)
+        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")
+    
+    def test_fetch_multiple_pages(self):
+        urls = [
+            'https://www.nbcnews.com/business',
+            'https://www.bbc.com/news'
+        ]
+        results = []
+        for url in urls:
+            result = self.crawler.run(
+                url=url,
+                word_count_threshold=5,
+                chunking_strategy=RegexChunking(),
+                extraction_strategy=CosineStrategy(),
+                bypass_cache=True
+            )
+            results.append(result)
+        
+        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
+        for result in results:
+            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")
+    
+    def test_run_fixed_length_word_chunking_and_no_extraction(self):
+        result = self.crawler.run(
+            url='https://www.nbcnews.com/business',
+            word_count_threshold=5,
+            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
+            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
+        )
+        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")
+
+    def test_run_sliding_window_and_no_extraction(self):
+        result = self.crawler.run(
+            url='https://www.nbcnews.com/business',
+            word_count_threshold=5,
+            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
+            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
+        )
+        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")
+
+# When this file is executed directly, run the test cases defined in this module.
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/test_scraper.py
+++ b/tests/test_scraper.py
@@ -0,0 +1,184 @@
+# basic_scraper_example.py
+from crawl4ai.scraper import (
+    AsyncWebScraper,
+    BFSScraperStrategy,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter
+)
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+async def basic_scraper_example():
+    """
+    Basic example: Scrape a blog site for articles
+    - Crawls only HTML pages
+    - Stays within the blog section
+    - Collects all results at once
+    """
+    # Create a simple filter chain
+    filter_chain = FilterChain([
+        # Only crawl pages within the blog section
+        URLPatternFilter("*/blog/*"),
+        # Only process HTML pages
+        ContentTypeFilter(["text/html"])
+    ])
+
+    # Initialize the strategy with basic configuration
+    strategy = BFSScraperStrategy(
+        max_depth=2,  # Only go 2 levels deep
+        filter_chain=filter_chain,
+        url_scorer=None,  # Use default scoring
+        max_concurrent=3  # Limit concurrent requests
+    )
+
+    # Create the crawler and scraper
+    crawler = AsyncWebCrawler()
+    scraper = AsyncWebScraper(crawler, strategy)
+
+    # Start scraping
+    try:
+        result = await scraper.ascrape("https://example.com/blog/")
+        
+        # Process results
+        print(f"Crawled {len(result.crawled_urls)} pages:")
+        for url, data in result.extracted_data.items():
+            print(f"- {url}: {len(data.html)} bytes")
+            
+    except Exception as e:
+        print(f"Error during scraping: {e}")
+
+# advanced_scraper_example.py
+import logging
+from crawl4ai.scraper import (
+    AsyncWebScraper,
+    BFSScraperStrategy,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter,
+    DomainFilter,
+    KeywordRelevanceScorer,
+    PathDepthScorer,
+    FreshnessScorer,
+    CompositeScorer
+)
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+async def advanced_scraper_example():
+    """
+    Advanced example: Intelligent news site scraping
+    - Uses all filter types
+    - Implements sophisticated scoring
+    - Streams results
+    - Includes monitoring and logging
+    """
+    # Set up logging
+    logging.basicConfig(level=logging.INFO)
+    logger = logging.getLogger("advanced_scraper")
+
+    # Create sophisticated filter chain
+    filter_chain = FilterChain([
+        # Domain control
+        DomainFilter(
+            allowed_domains=["example.com", "blog.example.com"],
+            blocked_domains=["ads.example.com", "tracker.example.com"]
+        ),
+        # URL patterns
+        URLPatternFilter([
+            "*/article/*",
+            "*/news/*",
+            "*/blog/*",
+            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
+        ]),
+        # Content types
+        ContentTypeFilter([
+            "text/html",
+            "application/xhtml+xml"
+        ])
+    ])
+
+    # Create composite scorer
+    scorer = CompositeScorer([
+        # Prioritize by keywords
+        KeywordRelevanceScorer(
+            keywords=["news", "breaking", "update", "latest"],
+            weight=1.0
+        ),
+        # Prefer optimal URL structure
+        PathDepthScorer(
+            optimal_depth=3,
+            weight=0.7
+        ),
+        # Prioritize fresh content
+        FreshnessScorer(weight=0.9)
+    ])
+
+    # Initialize strategy with advanced configuration
+    strategy = BFSScraperStrategy(
+        max_depth=4,
+        filter_chain=filter_chain,
+        url_scorer=scorer,
+        max_concurrent=5,
+        min_crawl_delay=1
+    )
+
+    # Create crawler and scraper
+    crawler = AsyncWebCrawler()
+    scraper = AsyncWebScraper(crawler, strategy)
+
+    # Track statistics
+    stats = {
+        'processed': 0,
+        'errors': 0,
+        'total_size': 0
+    }
+
+    try:
+        # Use streaming mode
+        async for result in scraper.ascrape("https://example.com/news/", stream=True):
+            stats['processed'] += 1
+            
+            if result.success:
+                stats['total_size'] += len(result.html)
+                logger.info(f"Processed: {result.url}")
+                
+                # Print scoring information
+                for scorer_name, score in result.scores.items():
+                    logger.debug(f"{scorer_name}: {score:.2f}")
+            else:
+                stats['errors'] += 1
+                logger.error(f"Failed to process {result.url}: {result.error_message}")
+
+            # Log progress regularly
+            if stats['processed'] % 10 == 0:
+                logger.info(f"Progress: {stats['processed']} URLs processed")
+
+    except Exception as e:
+        logger.error(f"Scraping error: {e}")
+    
+    finally:
+        # Print final statistics
+        logger.info("Scraping completed:")
+        logger.info(f"- URLs processed: {stats['processed']}")
+        logger.info(f"- Errors: {stats['errors']}")
+        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
+        
+        # Print filter statistics
+        for filter_ in filter_chain.filters:
+            logger.info(f"{filter_.name} stats:")
+            logger.info(f"- Passed: {filter_.stats.passed_urls}")
+            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
+        
+        # Print scorer statistics
+        logger.info("Scoring statistics:")
+        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
+        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+
+if __name__ == "__main__":
+    import asyncio
+    
+    # Run basic example
+    print("Running basic scraper example...")
+    asyncio.run(basic_scraper_example())
+    
+    print("\nRunning advanced scraper example...")
+    asyncio.run(advanced_scraper_example())