diff --git a/crawl4ai/scraper/bfs_scraper_strategy copy.py b/crawl4ai/scraper/bfs_scraper_strategy copy.py deleted file mode 100644 index 51bf9cb3..00000000 --- a/crawl4ai/scraper/bfs_scraper_strategy copy.py +++ /dev/null @@ -1,138 +0,0 @@ -from .scraper_strategy import ScraperStrategy -from .filters import FilterChain -from .scorers import URLScorer -from ..models import CrawlResult -from ..async_webcrawler import AsyncWebCrawler -import asyncio -import validators -from urllib.parse import urljoin,urlparse,urlunparse -from urllib.robotparser import RobotFileParser -import time -from aiolimiter import AsyncLimiter -from tenacity import retry, stop_after_attempt, wait_exponential -from collections import defaultdict -import logging -from typing import Dict, AsyncGenerator -logging.basicConfig(level=logging.DEBUG) - -rate_limiter = AsyncLimiter(1, 1) # 1 request per second - -class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): - self.max_depth = max_depth - self.filter_chain = filter_chain - self.url_scorer = url_scorer - self.max_concurrent = max_concurrent - # For Crawl Politeness - self.last_crawl_time = defaultdict(float) - self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain - # For Robots.txt Compliance - self.robot_parsers = {} - - # Robots.txt Parser - def get_robot_parser(self, url: str) -> RobotFileParser: - domain = urlparse(url) - scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided - netloc = domain.netloc - if netloc not in self.robot_parsers: - rp = RobotFileParser() - rp.set_url(f"{scheme}://{netloc}/robots.txt") - try: - rp.read() - except Exception as e: - # Log the type of error, message, and the URL - logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") - return None - self.robot_parsers[netloc] = 
rp - return self.robot_parsers[netloc] - - # Retry with exponential backoff - @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) - async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: - return await crawler.arun(url) - - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: - def normalize_url(url: str) -> str: - parsed = urlparse(url) - return urlunparse(parsed._replace(fragment="")) - - # URL Validation - if not validators.url(url): - logging.warning(f"Invalid URL: {url}") - return None - - # Robots.txt Compliance - robot_parser = self.get_robot_parser(url) - if robot_parser is None: - logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") - else: - # If robots.txt was fetched, check if crawling is allowed - if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): - logging.info(f"Skipping {url} as per robots.txt") - return None - - # Crawl Politeness - domain = urlparse(url).netloc - time_since_last_crawl = time.time() - self.last_crawl_time[domain] - if time_since_last_crawl < self.min_crawl_delay: - await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) - self.last_crawl_time[domain] = time.time() - - # Rate Limiting - async with rate_limiter: - # Error Handling - try: - crawl_result = await self.retry_crawl(crawler, url) - except Exception as e: - logging.error(f"Error crawling {url}: {str(e)}") - crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) - - if not crawl_result.success: - # Logging and Monitoring - logging.error(f"Failed to crawl URL: {url}. 
Error: {crawl_result.error_message}") - return crawl_result - - # Process links - for link_type in ["internal", "external"]: - for link in crawl_result.links[link_type]: - absolute_link = urljoin(url, link['href']) - normalized_link = normalize_url(absolute_link) - if self.filter_chain.apply(normalized_link) and normalized_link not in visited: - new_depth = depths[url] + 1 - if new_depth <= self.max_depth: - # URL Scoring - score = self.url_scorer.score(normalized_link) - await queue.put((score, new_depth, normalized_link)) - depths[normalized_link] = new_depth - return crawl_result - - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: - queue = asyncio.PriorityQueue() - queue.put_nowait((0, 0, start_url)) - visited = set() - depths = {start_url: 0} - pending_tasks = set() - - while not queue.empty() or pending_tasks: - while not queue.empty() and len(pending_tasks) < self.max_concurrent: - _, depth, url = await queue.get() - if url not in visited: - # Adding URL to the visited set here itself, (instead of after result generation) - # so that other tasks are not queued for same URL, found at different depth before - # crawling and extraction of this task is completed. 
- visited.add(url) - if parallel_processing: - task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) - pending_tasks.add(task) - else: - result = await self.process_url(url, depth, crawler, queue, visited, depths) - if result: - yield result - - # Wait for the first task to complete and yield results incrementally as each task is completed - if pending_tasks: - done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) - for task in done: - result = await task - if result: - yield result \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py index 525c9bdb..df5d13aa 100644 --- a/crawl4ai/scraper/filters/__init__.py +++ b/crawl4ai/scraper/filters/__init__.py @@ -1,3 +1,205 @@ -from .url_filter import URLFilter, FilterChain -from .content_type_filter import ContentTypeFilter -from .url_pattern_filter import URLPatternFilter \ No newline at end of file +# from .url_filter import URLFilter, FilterChain +# from .content_type_filter import ContentTypeFilter +# from .url_pattern_filter import URLPatternFilter + +from abc import ABC, abstractmethod +from typing import List, Pattern, Set, Union +import re +from urllib.parse import urlparse +import mimetypes +import logging +from dataclasses import dataclass +import fnmatch + +@dataclass +class FilterStats: + """Statistics for filter applications""" + total_urls: int = 0 + rejected_urls: int = 0 + passed_urls: int = 0 + +class URLFilter(ABC): + """Base class for URL filters""" + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FilterStats() + self.logger = logging.getLogger(f"urlfilter.{self.name}") + + @abstractmethod + def apply(self, url: str) -> bool: + """Apply the filter to a URL""" + pass + + def _update_stats(self, passed: bool): + """Update filter statistics""" + self.stats.total_urls += 1 + if passed: + self.stats.passed_urls += 1 + 
else: + self.stats.rejected_urls += 1 + +class FilterChain: + """Chain of URL filters.""" + + def __init__(self, filters: List[URLFilter] = None): + self.filters = filters or [] + self.stats = FilterStats() + self.logger = logging.getLogger("urlfilter.chain") + + def add_filter(self, filter_: URLFilter) -> 'FilterChain': + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining + + def apply(self, url: str) -> bool: + """Apply all filters in the chain""" + self.stats.total_urls += 1 + + for filter_ in self.filters: + if not filter_.apply(url): + self.stats.rejected_urls += 1 + self.logger.debug(f"URL {url} rejected by {filter_.name}") + return False + + self.stats.passed_urls += 1 + return True + +class URLPatternFilter(URLFilter): + """Filter URLs based on glob patterns or regex. + + pattern_filter = URLPatternFilter([ + "*.example.com/*", # Glob pattern + "*/article/*", # Path pattern + re.compile(r"blog-\d+") # Regex pattern + ]) + + - Supports glob patterns and regex + - Multiple patterns per filter + - Pattern pre-compilation for performance + """ + + def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], + use_glob: bool = True): + super().__init__() + self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + self.use_glob = use_glob + self._compiled_patterns = [] + + for pattern in self.patterns: + if isinstance(pattern, str) and use_glob: + self._compiled_patterns.append(self._glob_to_regex(pattern)) + else: + self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) + + def _glob_to_regex(self, pattern: str) -> Pattern: + """Convert glob pattern to regex""" + return re.compile(fnmatch.translate(pattern)) + + def apply(self, url: str) -> bool: + """Check if URL matches any of the patterns""" + matches = any(pattern.search(url) for pattern in self._compiled_patterns) + self._update_stats(matches) + return matches + +class 
ContentTypeFilter(URLFilter): + """Filter URLs based on expected content type. + + content_filter = ContentTypeFilter([ + "text/html", + "application/pdf" + ], check_extension=True) + + - Filter by MIME types + - Extension checking + - Support for multiple content types + """ + + def __init__(self, allowed_types: Union[str, List[str]], + check_extension: bool = True): + super().__init__() + self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types + self.check_extension = check_extension + self._normalize_types() + + def _normalize_types(self): + """Normalize content type strings""" + self.allowed_types = [t.lower() for t in self.allowed_types] + + def _check_extension(self, url: str) -> bool: + """Check URL's file extension""" + ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else '' + if not ext: + return True # No extension, might be dynamic content + + guessed_type = mimetypes.guess_type(url)[0] + return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) + + def apply(self, url: str) -> bool: + """Check if URL's content type is allowed""" + result = True + if self.check_extension: + result = self._check_extension(url) + self._update_stats(result) + return result + +class DomainFilter(URLFilter): + """Filter URLs based on allowed/blocked domains. 
+ + domain_filter = DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com"] + ) + + - Allow/block specific domains + - Subdomain support + - Efficient domain matching + """ + + def __init__(self, allowed_domains: Union[str, List[str]] = None, + blocked_domains: Union[str, List[str]] = None): + super().__init__() + self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None + self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set() + + def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]: + """Normalize domain strings""" + if isinstance(domains, str): + domains = [domains] + return [d.lower().strip() for d in domains] + + def _extract_domain(self, url: str) -> str: + """Extract domain from URL""" + return urlparse(url).netloc.lower() + + def apply(self, url: str) -> bool: + """Check if URL's domain is allowed""" + domain = self._extract_domain(url) + + if domain in self.blocked_domains: + self._update_stats(False) + return False + + if self.allowed_domains is not None and domain not in self.allowed_domains: + self._update_stats(False) + return False + + self._update_stats(True) + return True + +# Example usage: +def create_common_filter_chain() -> FilterChain: + """Create a commonly used filter chain""" + return FilterChain([ + URLPatternFilter([ + "*.html", "*.htm", # HTML files + "*/article/*", "*/blog/*" # Common content paths + ]), + ContentTypeFilter([ + "text/html", + "application/xhtml+xml" + ]), + DomainFilter( + blocked_domains=["ads.*", "analytics.*"] + ) + ]) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py index 9173eb4a..6966afdb 100644 --- a/crawl4ai/scraper/filters/content_type_filter.py +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -1,8 +1,43 @@ from .url_filter import URLFilter +from typing import 
List, Union +from urllib.parse import urlparse +import mimetypes + class ContentTypeFilter(URLFilter): - def __init__(self, contentType: str): - self.contentType = contentType + """Filter URLs based on expected content type""" + + def __init__(self, allowed_types: Union[str, List[str]], + check_extension: bool = True): + super().__init__() + self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types + self.check_extension = check_extension + self._normalize_types() + + def _normalize_types(self): + """Normalize content type strings""" + self.allowed_types = [t.lower() for t in self.allowed_types] + + def _check_extension(self, url: str) -> bool: + """Check URL's file extension""" + ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else '' + if not ext: + return True # No extension, might be dynamic content + + guessed_type = mimetypes.guess_type(url)[0] + return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types) + def apply(self, url: str) -> bool: - #TODO: This is a stub. Will implement this later - return True \ No newline at end of file + """Check if URL's content type is allowed""" + result = True + if self.check_extension: + result = self._check_extension(url) + self._update_stats(result) + return result + +# class ContentTypeFilter(URLFilter): +# def __init__(self, contentType: str): +# self.contentType = contentType +# def apply(self, url: str) -> bool: +# #TODO: This is a stub. 
Will implement this later +# return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py index 2b8bd6eb..88a2c60a 100644 --- a/crawl4ai/scraper/filters/url_filter.py +++ b/crawl4ai/scraper/filters/url_filter.py @@ -1,16 +1,72 @@ from abc import ABC, abstractmethod +from dataclasses import dataclass +import logging +from typing import List +@dataclass +class FilterStats: + """Statistics for filter applications""" + total_urls: int = 0 + rejected_urls: int = 0 + passed_urls: int = 0 class URLFilter(ABC): + """Base class for URL filters""" + + def __init__(self, name: str = None): + self.name = name or self.__class__.__name__ + self.stats = FilterStats() + self.logger = logging.getLogger(f"urlfilter.{self.name}") + @abstractmethod def apply(self, url: str) -> bool: + """Apply the filter to a URL""" pass -class FilterChain: - def __init__(self): - self.filters = [] + def _update_stats(self, passed: bool): + """Update filter statistics""" + self.stats.total_urls += 1 + if passed: + self.stats.passed_urls += 1 + else: + self.stats.rejected_urls += 1 - def add_filter(self, filter: URLFilter): - self.filters.append(filter) +class FilterChain: + """Chain of URL filters""" + + def __init__(self, filters: List[URLFilter] = None): + self.filters = filters or [] + self.stats = FilterStats() + self.logger = logging.getLogger("urlfilter.chain") + + def add_filter(self, filter_: URLFilter) -> 'FilterChain': + """Add a filter to the chain""" + self.filters.append(filter_) + return self # Enable method chaining def apply(self, url: str) -> bool: - return all(filter.apply(url) for filter in self.filters) \ No newline at end of file + """Apply all filters in the chain""" + self.stats.total_urls += 1 + + for filter_ in self.filters: + if not filter_.apply(url): + self.stats.rejected_urls += 1 + self.logger.debug(f"URL {url} rejected by {filter_.name}") + return False + + self.stats.passed_urls += 1 + return True + 
+# class URLFilter(ABC): +# @abstractmethod +# def apply(self, url: str) -> bool: +# pass + +# class FilterChain: +# def __init__(self): +# self.filters = [] + +# def add_filter(self, filter: URLFilter): +# self.filters.append(filter) + +# def apply(self, url: str) -> bool: +# return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py index fd5df133..1e02b4a6 100644 --- a/crawl4ai/scraper/filters/url_pattern_filter.py +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -1,9 +1,39 @@ from .url_filter import URLFilter from re import Pattern +from typing import List, Union +import re +import fnmatch + class URLPatternFilter(URLFilter): - def __init__(self, pattern: Pattern): - self.pattern = pattern + """Filter URLs based on glob patterns or regex""" + + def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], + use_glob: bool = True): + super().__init__() + self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns + self.use_glob = use_glob + self._compiled_patterns = [] + + for pattern in self.patterns: + if isinstance(pattern, str) and use_glob: + self._compiled_patterns.append(self._glob_to_regex(pattern)) + else: + self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern) + + def _glob_to_regex(self, pattern: str) -> Pattern: + """Convert glob pattern to regex""" + return re.compile(fnmatch.translate(pattern)) + def apply(self, url: str) -> bool: - #TODO: This is a stub. Will implement this later. 
- return True \ No newline at end of file + """Check if URL matches any of the patterns""" + matches = any(pattern.search(url) for pattern in self._compiled_patterns) + self._update_stats(matches) + return matches + +# class URLPatternFilter(URLFilter): +# def __init__(self, pattern: Pattern): +# self.pattern = pattern +# def apply(self, url: str) -> bool: +# #TODO: This is a stub. Will implement this later. +# return True \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py index 05c61c94..548b80f0 100644 --- a/crawl4ai/scraper/scorers/__init__.py +++ b/crawl4ai/scraper/scorers/__init__.py @@ -1,2 +1,268 @@ -from .url_scorer import URLScorer -from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file +# from .url_scorer import URLScorer +# from .keyword_relevance_scorer import KeywordRelevanceScorer + +from abc import ABC, abstractmethod +from typing import List, Dict, Optional, Union +from dataclasses import dataclass +from urllib.parse import urlparse, unquote +import re +from collections import defaultdict +import math +import logging + +@dataclass +class ScoringStats: + """Statistics for URL scoring""" + urls_scored: int = 0 + total_score: float = 0.0 + min_score: float = float('inf') + max_score: float = float('-inf') + + def update(self, score: float): + """Update scoring statistics""" + self.urls_scored += 1 + self.total_score += score + self.min_score = min(self.min_score, score) + self.max_score = max(self.max_score, score) + + @property + def average_score(self) -> float: + """Calculate average score""" + return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0 + +class URLScorer(ABC): + """Base class for URL scoring strategies""" + + def __init__(self, weight: float = 1.0, name: str = None): + self.weight = weight + self.name = name or self.__class__.__name__ + self.stats = ScoringStats() + self.logger = 
logging.getLogger(f"urlscorer.{self.name}") + + @abstractmethod + def _calculate_score(self, url: str) -> float: + """Calculate the raw score for a URL""" + pass + + def score(self, url: str) -> float: + """Calculate the weighted score for a URL""" + raw_score = self._calculate_score(url) + weighted_score = raw_score * self.weight + self.stats.update(weighted_score) + return weighted_score + +class CompositeScorer(URLScorer): + """Combines multiple scorers with weights""" + + def __init__(self, scorers: List[URLScorer], normalize: bool = True): + super().__init__(name="CompositeScorer") + self.scorers = scorers + self.normalize = normalize + + def _calculate_score(self, url: str) -> float: + scores = [scorer.score(url) for scorer in self.scorers] + total_score = sum(scores) + + if self.normalize and scores: + total_score /= len(scores) + + return total_score + +class KeywordRelevanceScorer(URLScorer): + """Score URLs based on keyword relevance. + + keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "programming"], + weight=1.0, + case_sensitive=False + ) + + - Score based on keyword matches + - Case sensitivity options + - Weighted scoring + """ + + def __init__(self, keywords: List[str], weight: float = 1.0, + case_sensitive: bool = False): + super().__init__(weight=weight) + self.keywords = keywords + self.case_sensitive = case_sensitive + self._compile_keywords() + + def _compile_keywords(self): + """Prepare keywords for matching""" + flags = 0 if self.case_sensitive else re.IGNORECASE + self.patterns = [re.compile(re.escape(k), flags) for k in self.keywords] + + def _calculate_score(self, url: str) -> float: + """Calculate score based on keyword matches""" + decoded_url = unquote(url) + total_matches = sum( + 1 for pattern in self.patterns + if pattern.search(decoded_url) + ) + # Normalize score between 0 and 1 + return total_matches / len(self.patterns) if self.patterns else 0.0 + +class PathDepthScorer(URLScorer): + """Score URLs based on their 
path depth. + + path_scorer = PathDepthScorer( + optimal_depth=3, # Preferred URL depth + weight=0.7 + ) + + - Score based on URL path depth + - Configurable optimal depth + - Diminishing returns for deeper paths + """ + + def __init__(self, optimal_depth: int = 3, weight: float = 1.0): + super().__init__(weight=weight) + self.optimal_depth = optimal_depth + + def _calculate_score(self, url: str) -> float: + """Calculate score based on path depth""" + path = urlparse(url).path + depth = len([x for x in path.split('/') if x]) + + # Score decreases as we move away from optimal depth + distance_from_optimal = abs(depth - self.optimal_depth) + return 1.0 / (1.0 + distance_from_optimal) + +class ContentTypeScorer(URLScorer): + """Score URLs based on content type preferences. + + content_scorer = ContentTypeScorer({ + r'\.html$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 + }) + + - Score based on file types + - Configurable type weights + - Pattern matching support + """ + + def __init__(self, type_weights: Dict[str, float], weight: float = 1.0): + super().__init__(weight=weight) + self.type_weights = type_weights + self._compile_patterns() + + def _compile_patterns(self): + """Prepare content type patterns""" + self.patterns = { + re.compile(pattern): weight + for pattern, weight in self.type_weights.items() + } + + def _calculate_score(self, url: str) -> float: + """Calculate score based on content type matching""" + for pattern, weight in self.patterns.items(): + if pattern.search(url): + return weight + return 0.0 + +class FreshnessScorer(URLScorer): + """Score URLs based on freshness indicators. 
+ + freshness_scorer = FreshnessScorer(weight=0.9) + + Score based on date indicators in URLs + Multiple date format support + Recency weighting""" + + def __init__(self, weight: float = 1.0): + super().__init__(weight=weight) + self.date_patterns = [ + r'/(\d{4})/(\d{2})/(\d{2})/', # yyyy/mm/dd + r'(\d{4})[-_](\d{2})[-_](\d{2})', # yyyy-mm-dd + r'/(\d{4})/', # year only + ] + self._compile_patterns() + + def _compile_patterns(self): + """Prepare date patterns""" + self.compiled_patterns = [re.compile(p) for p in self.date_patterns] + + def _calculate_score(self, url: str) -> float: + """Calculate score based on date indicators""" + for pattern in self.compiled_patterns: + if match := pattern.search(url): + year = int(match.group(1)) + # Score higher for more recent years + return 1.0 - (2024 - year) * 0.1 + return 0.5 # Default score for URLs without dates + +class DomainAuthorityScorer(URLScorer): + """Score URLs based on domain authority. + + authority_scorer = DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 + }) + + Score based on domain importance + Configurable domain weights + Default weight for unknown domains""" + + def __init__(self, domain_weights: Dict[str, float], + default_weight: float = 0.5, weight: float = 1.0): + super().__init__(weight=weight) + self.domain_weights = domain_weights + self.default_weight = default_weight + + def _calculate_score(self, url: str) -> float: + """Calculate score based on domain authority""" + domain = urlparse(url).netloc.lower() + return self.domain_weights.get(domain, self.default_weight) + +def create_balanced_scorer() -> CompositeScorer: + """Create a balanced composite scorer""" + return CompositeScorer([ + KeywordRelevanceScorer( + keywords=["article", "blog", "news", "research"], + weight=1.0 + ), + PathDepthScorer( + optimal_depth=3, + weight=0.7 + ), + ContentTypeScorer( + type_weights={ + r'\.html?$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 + }, + weight=0.8 + ), + 
FreshnessScorer( + weight=0.9 + ) + ]) + +# Example Usage: +""" +# Create a composite scorer +scorer = CompositeScorer([ + KeywordRelevanceScorer(["python", "programming"], weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.7), + FreshnessScorer(weight=0.8), + DomainAuthorityScorer( + domain_weights={ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 + }, + weight=0.9 + ) +]) + +# Score a URL +score = scorer.score("https://python.org/article/2024/01/new-features") + +# Access statistics +print(f"Average score: {scorer.stats.average_score}") +print(f"URLs scored: {scorer.stats.urls_scored}") +""" \ No newline at end of file diff --git a/docs/scrapper/filters_scrorers.md b/docs/scrapper/filters_scrorers.md new file mode 100644 index 00000000..22b846c6 --- /dev/null +++ b/docs/scrapper/filters_scrorers.md @@ -0,0 +1,342 @@ +# URL Filters and Scorers + +The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively. 
+ +```mermaid +flowchart TB + Start([URL Input]) --> Chain[Filter Chain] + + subgraph Chain Process + Chain --> Pattern{URL Pattern\nFilter} + Pattern -->|Match| Content{Content Type\nFilter} + Pattern -->|No Match| Reject1[Reject URL] + + Content -->|Allowed| Domain{Domain\nFilter} + Content -->|Not Allowed| Reject2[Reject URL] + + Domain -->|Allowed| Accept[Accept URL] + Domain -->|Blocked| Reject3[Reject URL] + end + + subgraph Statistics + Pattern --> UpdatePattern[Update Pattern Stats] + Content --> UpdateContent[Update Content Stats] + Domain --> UpdateDomain[Update Domain Stats] + Accept --> UpdateChain[Update Chain Stats] + Reject1 --> UpdateChain + Reject2 --> UpdateChain + Reject3 --> UpdateChain + end + + Accept --> End([End]) + Reject1 --> End + Reject2 --> End + Reject3 --> End + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef decision fill:#fff59d,stroke:#000,stroke-width:2px; + classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px; + classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px; + + class Start,End accept; + class Pattern,Content,Domain decision; + class Reject1,Reject2,Reject3 reject; + class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process; +``` + +## URL Filters + +URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules. + +### Available Filters + +1. **URL Pattern Filter** +```python +pattern_filter = URLPatternFilter([ + "*.example.com/*", # Glob pattern + "*/article/*", # Path pattern + re.compile(r"blog-\d+") # Regex pattern +]) +``` +- Supports glob patterns and regex +- Multiple patterns per filter +- Pattern pre-compilation for performance + +2. **Content Type Filter** +```python +content_filter = ContentTypeFilter([ + "text/html", + "application/pdf" +], check_extension=True) +``` +- Filter by MIME types +- Extension checking +- Support for multiple content types + +3. 
**Domain Filter** +```python +domain_filter = DomainFilter( + allowed_domains=["example.com", "blog.example.com"], + blocked_domains=["ads.example.com"] +) +``` +- Allow/block specific domains +- Subdomain support +- Efficient domain matching + +### Creating Filter Chains + +```python +# Create and configure a filter chain +filter_chain = FilterChain([ + URLPatternFilter(["*.example.com/*"]), + ContentTypeFilter(["text/html"]), + DomainFilter(blocked_domains=["ads.*"]) +]) + +# Add more filters +filter_chain.add_filter( + URLPatternFilter(["*/article/*"]) +) +``` + +```mermaid +flowchart TB + Start([URL Input]) --> Composite[Composite Scorer] + + subgraph Scoring Process + Composite --> Keywords[Keyword Relevance] + Composite --> Path[Path Depth] + Composite --> Content[Content Type] + Composite --> Fresh[Freshness] + Composite --> Domain[Domain Authority] + + Keywords --> KeywordScore[Calculate Score] + Path --> PathScore[Calculate Score] + Content --> ContentScore[Calculate Score] + Fresh --> FreshScore[Calculate Score] + Domain --> DomainScore[Calculate Score] + + KeywordScore --> Weight1[Apply Weight] + PathScore --> Weight2[Apply Weight] + ContentScore --> Weight3[Apply Weight] + FreshScore --> Weight4[Apply Weight] + DomainScore --> Weight5[Apply Weight] + end + + Weight1 --> Combine[Combine Scores] + Weight2 --> Combine + Weight3 --> Combine + Weight4 --> Combine + Weight5 --> Combine + + Combine --> Normalize{Normalize?} + Normalize -->|Yes| NormalizeScore[Normalize Combined Score] + Normalize -->|No| FinalScore[Final Score] + NormalizeScore --> FinalScore + + FinalScore --> Stats[Update Statistics] + Stats --> End([End]) + + classDef process fill:#90caf9,stroke:#000,stroke-width:2px; + classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px; + classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px; + classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px; + + class Start,End calc; + class Keywords,Path,Content,Fresh,Domain scorer; + class 
KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process; + class Normalize decision; +``` + +## URL Scorers + +URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority. + +### Available Scorers + +1. **Keyword Relevance Scorer** +```python +keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "programming"], + weight=1.0, + case_sensitive=False +) +``` +- Score based on keyword matches +- Case sensitivity options +- Weighted scoring + +2. **Path Depth Scorer** +```python +path_scorer = PathDepthScorer( + optimal_depth=3, # Preferred URL depth + weight=0.7 +) +``` +- Score based on URL path depth +- Configurable optimal depth +- Diminishing returns for deeper paths + +3. **Content Type Scorer** +```python +content_scorer = ContentTypeScorer({ + r'\.html$': 1.0, + r'\.pdf$': 0.8, + r'\.xml$': 0.6 +}) +``` +- Score based on file types +- Configurable type weights +- Pattern matching support + +4. **Freshness Scorer** +```python +freshness_scorer = FreshnessScorer(weight=0.9) +``` +- Score based on date indicators in URLs +- Multiple date format support +- Recency weighting + +5. **Domain Authority Scorer** +```python +authority_scorer = DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9, + "medium.com": 0.7 +}) +``` +- Score based on domain importance +- Configurable domain weights +- Default weight for unknown domains + +### Combining Scorers + +```python +# Create a composite scorer +composite_scorer = CompositeScorer([ + KeywordRelevanceScorer(["python"], weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.7), + FreshnessScorer(weight=0.8) +], normalize=True) +``` + +## Best Practices + +### Filter Configuration + +1. **Start Restrictive** + ```python + # Begin with strict filters + filter_chain = FilterChain([ + DomainFilter(allowed_domains=["example.com"]), + ContentTypeFilter(["text/html"]) + ]) + ``` + +2. 
**Layer Filters** + ```python + # Add more specific filters + filter_chain.add_filter( + URLPatternFilter(["*/article/*", "*/blog/*"]) + ) + ``` + +3. **Monitor Filter Statistics** + ```python + # Check filter performance + for filter_ in filter_chain.filters: + print(f"{filter_.name}: {filter_.stats.rejected_urls} rejected") + ``` + +### Scorer Configuration + +1. **Balance Weights** + ```python + # Balanced scoring configuration + scorer = create_balanced_scorer() + ``` + +2. **Customize for Content** + ```python + # News site configuration + news_scorer = CompositeScorer([ + KeywordRelevanceScorer(["news", "article"], weight=1.0), + FreshnessScorer(weight=1.0), + PathDepthScorer(optimal_depth=2, weight=0.5) + ]) + ``` + +3. **Monitor Scoring Statistics** + ```python + # Check scoring distribution + print(f"Average score: {scorer.stats.average_score}") + print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}") + ``` + +## Common Use Cases + +### Blog Crawling +```python +blog_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/blog/*", "*/post/*"]), + ContentTypeFilter(["text/html"]) + ]), + 'scorer': CompositeScorer([ + FreshnessScorer(weight=1.0), + KeywordRelevanceScorer(["blog", "article"], weight=0.8) + ]) +} +``` + +### Documentation Sites +```python +docs_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/docs/*", "*/guide/*"]), + ContentTypeFilter(["text/html", "application/pdf"]) + ]), + 'scorer': CompositeScorer([ + PathDepthScorer(optimal_depth=3, weight=1.0), + KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9) + ]) +} +``` + +### E-commerce Sites +```python +ecommerce_config = { + 'filters': FilterChain([ + URLPatternFilter(["*/product/*", "*/category/*"]), + DomainFilter(blocked_domains=["ads.*", "tracker.*"]) + ]), + 'scorer': CompositeScorer([ + PathDepthScorer(optimal_depth=2, weight=1.0), + ContentTypeScorer({ + r'/product/': 1.0, + r'/category/': 0.8 + }) + ]) +} +``` + +## Advanced Topics + +### Custom 
Filters +```python +class CustomFilter(URLFilter): + def apply(self, url: str) -> bool: + # Your custom filtering logic + return True +``` + +### Custom Scorers +```python +class CustomScorer(URLScorer): + def _calculate_score(self, url: str) -> float: + # Your custom scoring logic + return 1.0 +``` + +For more examples, check our [example repository](https://github.com/example/crawl4ai/examples). \ No newline at end of file diff --git a/docs/scrapper/how_to_use.md b/docs/scrapper/how_to_use.md new file mode 100644 index 00000000..79f7912f --- /dev/null +++ b/docs/scrapper/how_to_use.md @@ -0,0 +1,213 @@ +# Scraper Examples Guide + +This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features. + +## Basic Example + +The basic example demonstrates a simple blog scraping scenario: + +```python +from crawl4ai.scraper import AsyncWebScraper, BFSScraperStrategy, FilterChain, URLPatternFilter, ContentTypeFilter +from crawl4ai.async_webcrawler import AsyncWebCrawler + +# Create simple filter chain +filter_chain = FilterChain([ + URLPatternFilter("*/blog/*"), + ContentTypeFilter(["text/html"]) +]) + +# Initialize strategy +strategy = BFSScraperStrategy( + max_depth=2, + filter_chain=filter_chain, + url_scorer=None, + max_concurrent=3 +) + +# Create and run scraper +crawler = AsyncWebCrawler() +scraper = AsyncWebScraper(crawler, strategy) +result = await scraper.ascrape("https://example.com/blog/") +``` + +### Features Demonstrated +- Basic URL filtering +- Simple content type filtering +- Depth control +- Concurrent request limiting +- Result collection + +## Advanced Example + +The advanced example shows a sophisticated news site scraping setup with all features enabled: + +```python +import re +from crawl4ai.scraper import ( + BFSScraperStrategy, FilterChain, DomainFilter, URLPatternFilter, ContentTypeFilter, + CompositeScorer, KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer +) + +# Create comprehensive filter chain +filter_chain = FilterChain([ + DomainFilter( + allowed_domains=["example.com"], + blocked_domains=["ads.example.com"] + ), + URLPatternFilter([ + "*/article/*", + re.compile(r"\d{4}/\d{2}/.*") + ]), + ContentTypeFilter(["text/html"]) 
+]) + +# Create intelligent scorer +scorer = CompositeScorer([ + KeywordRelevanceScorer( + keywords=["news", "breaking"], + weight=1.0 + ), + PathDepthScorer(optimal_depth=3, weight=0.7), + FreshnessScorer(weight=0.9) +]) + +# Initialize advanced strategy +strategy = BFSScraperStrategy( + max_depth=4, + filter_chain=filter_chain, + url_scorer=scorer, + max_concurrent=5 +) +``` + +### Features Demonstrated +1. **Advanced Filtering** + - Domain filtering + - Pattern matching + - Content type control + +2. **Intelligent Scoring** + - Keyword relevance + - Path optimization + - Freshness priority + +3. **Monitoring** + - Progress tracking + - Error handling + - Statistics collection + +4. **Resource Management** + - Concurrent processing + - Rate limiting + - Cleanup handling + +## Running the Examples + +```bash +# Basic usage +python basic_scraper_example.py + +# Advanced usage with logging +PYTHONPATH=. python advanced_scraper_example.py +``` + +## Example Output + +### Basic Example +``` +Crawled 15 pages: +- https://example.com/blog/post1: 24560 bytes +- https://example.com/blog/post2: 18920 bytes +... +``` + +### Advanced Example +``` +INFO: Starting crawl of https://example.com/news/ +INFO: Processed: https://example.com/news/breaking/story1 +DEBUG: KeywordScorer: 0.85 +DEBUG: FreshnessScorer: 0.95 +INFO: Progress: 10 URLs processed +... +INFO: Scraping completed: +INFO: - URLs processed: 50 +INFO: - Errors: 2 +INFO: - Total content size: 1240.50 KB +``` + +## Customization + +### Adding Custom Filters +```python +class CustomFilter(URLFilter): + def apply(self, url: str) -> bool: + # Your custom filtering logic + return True + +filter_chain.add_filter(CustomFilter()) +``` + +### Custom Scoring Logic +```python +class CustomScorer(URLScorer): + def _calculate_score(self, url: str) -> float: + # Your custom scoring logic + return 1.0 + +scorer = CompositeScorer([ + CustomScorer(weight=1.0), + ... +]) +``` + +## Best Practices + +1. 
**Start Simple** + - Begin with basic filtering + - Add features incrementally + - Test thoroughly at each step + +2. **Monitor Performance** + - Watch memory usage + - Track processing times + - Adjust concurrency as needed + +3. **Handle Errors** + - Implement proper error handling + - Log important events + - Track error statistics + +4. **Optimize Resources** + - Set appropriate delays + - Limit concurrent requests + - Use streaming for large crawls + +## Troubleshooting + +Common issues and solutions: + +1. **Too Many Requests** + ```python + strategy = BFSScraperStrategy( + max_concurrent=3, # Reduce concurrent requests + min_crawl_delay=2 # Increase delay between requests + ) + ``` + +2. **Memory Issues** + ```python + # Use streaming mode for large crawls + async for result in scraper.ascrape(url, stream=True): + process_result(result) + ``` + +3. **Missing Content** + ```python + # Check your filter chain + filter_chain = FilterChain([ + URLPatternFilter("*"), # Broaden patterns + ContentTypeFilter(["*"]) # Accept all content + ]) + ``` + +For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples). 
# ===========================================================================
# File: docs/scrapper/web_crawler_quick_start.py  (new file in this diff)
#
# unittest suite that drives WebCrawler.run() with different chunking and
# extraction strategy combinations against real news URLs.
# ===========================================================================
import os
import unittest

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy


class TestWebCrawler(unittest.TestCase):
    """Exercise WebCrawler.run() with several strategy pairs and bad inputs."""

    def setUp(self):
        """Create a fresh WebCrawler before every test."""
        self.crawler = WebCrawler()

    def test_warmup(self):
        """warmup() must leave the crawler flagged as ready."""
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        """RegexChunking + CosineStrategy should produce a successful result."""
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(), bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")

    def test_run_different_strategies(self):
        """Two alternative chunking/extraction pairs should both succeed."""
        url = 'https://www.nbcnews.com/business'

        # Test with FixedLengthWordChunking and LLMExtractionStrategy.
        # The API token is read from the OPENAI_API_KEY environment variable.
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")

    def test_invalid_url(self):
        """A malformed URL must raise with an 'Invalid URL' message."""
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='invalid_url', bypass_cache=True)
        # Checked after the block: code following the raising call inside the
        # `with` body would never execute.
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        """Passing a non-strategy object must raise a descriptive error."""
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        """An invalid CSS selector must raise ValueError."""
        with self.assertRaises(ValueError) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        """The same URL should succeed both with and without bypass_cache."""
        url = 'https://www.nbcnews.com/business'

        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")

        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        """Crawling a list of URLs one by one should succeed for each."""
        urls = [
            'https://www.nbcnews.com/business',
            'https://www.bbc.com/news'
        ]
        results = [
            self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True
            )
            for url in urls
        ]

        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        """FixedLengthWordChunking + NoExtractionStrategy should succeed."""
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")

    def test_run_sliding_window_and_no_extraction(self):
        """SlidingWindowChunking + NoExtractionStrategy should succeed."""
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(), bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")


if __name__ == '__main__':
    unittest.main()


# ===========================================================================
# File: tests/test_scraper.py  (new file in this diff)
#
# Two runnable usage examples for crawl4ai.scraper: a basic blog crawl that
# collects all results at once, and an advanced news crawl that streams
# results and logs statistics.
# ===========================================================================
import asyncio
import logging
import re  # required by the date-based URL pattern in advanced_scraper_example

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer
)


# basic_scraper_example.py
async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once

    Any exception raised while scraping is caught and printed.
    """
    # Create a simple filter chain
    filter_chain = FilterChain([
        # Only crawl pages within the blog section
        URLPatternFilter("*/blog/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        max_concurrent=3  # Limit concurrent requests
    )

    # Create the crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Start scraping
    try:
        result = await scraper.ascrape("https://example.com/blog/")

        # Process results
        print(f"Crawled {len(result.crawled_urls)} pages:")
        for url, data in result.extracted_data.items():
            print(f"- {url}: {len(data.html)} bytes")

    except Exception as e:
        print(f"Error during scraping: {e}")


# advanced_scraper_example.py
async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging

    Scraping errors are logged rather than raised; final statistics are
    logged in all cases.
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")
    # Per-scorer values below are logged at DEBUG; without lowering this
    # logger's own level they would be filtered out by the INFO root level.
    logger.setLevel(logging.DEBUG)

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1

            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")

                # Print scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")

            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")

    except Exception as e:
        logger.error(f"Scraping error: {e}")

    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")


if __name__ == "__main__":
    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    print("\nRunning advanced scraper example...")
    asyncio.run(advanced_scraper_example())