feat(scraper): Enhance URL filtering and scoring systems
Implement comprehensive URL filtering and scoring capabilities: Filters: - Add URLPatternFilter with glob/regex support - Implement ContentTypeFilter with MIME type checking - Add DomainFilter for domain control - Create FilterChain with stats tracking Scorers: - Complete KeywordRelevanceScorer implementation - Add PathDepthScorer for URL structure scoring - Implement ContentTypeScorer for file type priorities - Add FreshnessScorer for date-based scoring - Add DomainAuthorityScorer for domain weighting - Create CompositeScorer for combined strategies Features: - Add statistics tracking for both filters and scorers - Implement logging support throughout - Add resource cleanup methods - Create comprehensive documentation - Include performance optimizations Tests and docs included. Note: Review URL normalization overlap with recent crawler changes. - Quick Start is created and added
This commit is contained in:
@@ -1,138 +0,0 @@
|
||||
import asyncio
import logging
import time
from collections import defaultdict
from typing import AsyncGenerator, Dict, Optional
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.robotparser import RobotFileParser

import validators
from aiolimiter import AsyncLimiter
from tenacity import retry, stop_after_attempt, wait_exponential

from .filters import FilterChain
from .scorers import URLScorer
from .scraper_strategy import ScraperStrategy
from ..async_webcrawler import AsyncWebCrawler
from ..models import CrawlResult
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
rate_limiter = AsyncLimiter(1, 1) # 1 request per second
|
||||
|
||||
class BFSScraperStrategy(ScraperStrategy):
    """Breadth-first scraping strategy with robots.txt compliance,
    per-domain politeness delays, global rate limiting, and scored URL
    prioritisation.

    URLs wait in an ``asyncio.PriorityQueue`` as ``(priority, depth, url)``
    tuples. ``PriorityQueue`` pops the *smallest* entry first, so scores are
    negated on insertion: higher-scoring URLs are crawled first.
    """

    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int = 1):
        """
        Args:
            max_depth: maximum link depth to follow from the start URL.
            filter_chain: filters deciding which discovered URLs to enqueue.
            url_scorer: scorer used to prioritise queued URLs.
            max_concurrent: maximum number of in-flight crawl tasks.
            min_crawl_delay: minimum seconds between requests to one domain.
        """
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.max_concurrent = max_concurrent
        # Crawl politeness: remember when each domain was last hit.
        self.last_crawl_time = defaultdict(float)
        self.min_crawl_delay = min_crawl_delay  # delay between requests to the same domain
        # Robots.txt compliance: parsers cached per netloc.
        self.robot_parsers = {}

    def get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
        """Return a cached RobotFileParser for *url*'s host.

        Fetches robots.txt on first use. Returns None when robots.txt could
        not be retrieved; callers treat that as "crawl allowed".
        """
        parsed = urlparse(url)
        scheme = parsed.scheme if parsed.scheme else 'http'  # Default to 'http' if no scheme provided
        netloc = parsed.netloc
        if netloc not in self.robot_parsers:
            rp = RobotFileParser()
            rp.set_url(f"{scheme}://{netloc}/robots.txt")
            try:
                # NOTE(review): RobotFileParser.read() is blocking network
                # I/O inside async code; tolerable because the result is
                # cached per host, but worth moving to a thread eventually.
                rp.read()
            except Exception as e:
                # Log the type of error, message, and the URL
                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
                return None
            self.robot_parsers[netloc] = rp
        return self.robot_parsers[netloc]

    # Retry with exponential backoff
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
        """Crawl *url*, retrying up to 3 times with exponential backoff."""
        return await crawler.arun(url)

    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> Optional[CrawlResult]:
        """Validate, crawl, and expand a single URL.

        Enqueues the page's outgoing links (filtered, scored, fragment-
        stripped) onto *queue*. Returns the CrawlResult, or None when the
        URL is invalid or disallowed by robots.txt.

        Note: this is a plain coroutine, not an async generator — the
        previous ``AsyncGenerator`` annotation was incorrect.
        """
        def normalize_url(u: str) -> str:
            # Drop the fragment so page#a and page#b dedupe to one URL.
            return urlunparse(urlparse(u)._replace(fragment=""))

        # URL Validation
        if not validators.url(url):
            logging.warning(f"Invalid URL: {url}")
            return None

        # Robots.txt Compliance
        robot_parser = self.get_robot_parser(url)
        if robot_parser is None:
            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
        else:
            # If robots.txt was fetched, check if crawling is allowed
            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
                logging.info(f"Skipping {url} as per robots.txt")
                return None

        # Crawl Politeness: wait out the per-domain delay.
        domain = urlparse(url).netloc
        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
        if time_since_last_crawl < self.min_crawl_delay:
            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
        self.last_crawl_time[domain] = time.time()

        # Rate Limiting (module-level limiter: 1 request/second globally)
        async with rate_limiter:
            # Error Handling: degrade to a failed CrawlResult, never raise.
            try:
                crawl_result = await self.retry_crawl(crawler, url)
            except Exception as e:
                logging.error(f"Error crawling {url}: {str(e)}")
                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))

        if not crawl_result.success:
            # Logging and Monitoring
            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
            return crawl_result

        # Process links: filter, score, and enqueue newly discovered URLs.
        for link_type in ["internal", "external"]:
            for link in crawl_result.links[link_type]:
                absolute_link = urljoin(url, link['href'])
                normalized_link = normalize_url(absolute_link)
                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
                    new_depth = depths[url] + 1
                    if new_depth <= self.max_depth:
                        # URL Scoring
                        score = self.url_scorer.score(normalized_link)
                        # FIX: PriorityQueue retrieves the lowest entry
                        # first, so the score must be negated — otherwise
                        # the highest-scoring URLs are crawled *last*.
                        await queue.put((-score, new_depth, normalized_link))
                        depths[normalized_link] = new_depth
        return crawl_result

    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True) -> AsyncGenerator[CrawlResult, None]:
        """Breadth-first crawl from *start_url*, yielding CrawlResults as
        they complete.

        With ``parallel_processing`` enabled, up to ``max_concurrent`` URLs
        are crawled concurrently and results are yielded as soon as each
        task finishes.
        """
        queue = asyncio.PriorityQueue()
        queue.put_nowait((0, 0, start_url))
        visited = set()
        depths = {start_url: 0}
        pending_tasks = set()

        while not queue.empty() or pending_tasks:
            # Launch new work while there is capacity and queued URLs.
            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
                _, depth, url = await queue.get()
                if url not in visited:
                    # Adding URL to the visited set here itself, (instead of after result generation)
                    # so that other tasks are not queued for same URL, found at different depth before
                    # crawling and extraction of this task is completed.
                    visited.add(url)
                    if parallel_processing:
                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
                        pending_tasks.add(task)
                    else:
                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
                        if result:
                            yield result

            # Wait for the first task to complete and yield results incrementally as each task is completed
            if pending_tasks:
                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    result = await task
                    if result:
                        yield result
|
||||
@@ -1,3 +1,205 @@
|
||||
from .url_filter import URLFilter, FilterChain
|
||||
from .content_type_filter import ContentTypeFilter
|
||||
from .url_pattern_filter import URLPatternFilter
|
||||
# from .url_filter import URLFilter, FilterChain
|
||||
# from .content_type_filter import ContentTypeFilter
|
||||
# from .url_pattern_filter import URLPatternFilter
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Pattern, Set, Union
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
import mimetypes
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
import fnmatch
|
||||
|
||||
@dataclass
class FilterStats:
    """Statistics for filter applications"""
    # Total number of URLs this filter has evaluated.
    total_urls: int = 0
    # URLs the filter rejected.
    rejected_urls: int = 0
    # URLs the filter allowed through (total = rejected + passed).
    passed_urls: int = 0
|
||||
|
||||
class URLFilter(ABC):
    """Abstract base for URL filters.

    Provides per-filter statistics and a namespaced logger; subclasses
    implement apply().
    """

    def __init__(self, name: str = None):
        # Fall back to the concrete class name when no name is supplied.
        self.name = name if name else self.__class__.__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return True if *url* passes this filter."""
        ...

    def _update_stats(self, passed: bool):
        """Record one filter decision in the statistics."""
        self.stats.total_urls += 1
        counter = "passed_urls" if passed else "rejected_urls"
        setattr(self.stats, counter, getattr(self.stats, counter) + 1)
|
||||
|
||||
class FilterChain:
    """Applies URL filters in sequence; the first rejection wins."""

    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
        """Append a filter and return self so calls can be chained."""
        self.filters.append(filter_)
        return self

    def apply(self, url: str) -> bool:
        """Run *url* through every filter, short-circuiting on rejection."""
        self.stats.total_urls += 1

        # Lazily find the first filter that rejects; later filters are
        # never invoked once one fails (same short-circuit as before).
        failed = next((f for f in self.filters if not f.apply(url)), None)
        if failed is not None:
            self.stats.rejected_urls += 1
            self.logger.debug(f"URL {url} rejected by {failed.name}")
            return False

        self.stats.passed_urls += 1
        return True
|
||||
|
||||
class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex.

    pattern_filter = URLPatternFilter([
        "*.example.com/*", # Glob pattern
        "*/article/*", # Path pattern
        re.compile(r"blog-\d+") # Regex pattern
    ])

    - Supports glob patterns and regex
    - Multiple patterns per filter
    - Pattern pre-compilation for performance
    """

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]],
                 use_glob: bool = True):
        super().__init__()
        # FIX: runtime checks use re.Pattern — isinstance() against
        # typing.Pattern is deprecated and the alias is removed in
        # Python 3.12.
        self.patterns = [patterns] if isinstance(patterns, (str, re.Pattern)) else patterns
        self.use_glob = use_glob
        self._compiled_patterns = []

        for pattern in self.patterns:
            if isinstance(pattern, str) and use_glob:
                # Glob strings (e.g. "*.html") become equivalent regexes.
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
                # Plain strings are regex source; pre-compiled patterns
                # are used as-is.
                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to a compiled regex."""
        return re.compile(fnmatch.translate(pattern))

    def apply(self, url: str) -> bool:
        """Return True if *url* matches any configured pattern."""
        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
        self._update_stats(matches)
        return matches
|
||||
|
||||
class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type.

    content_filter = ContentTypeFilter([
        "text/html",
        "application/pdf"
    ], check_extension=True)

    - Filter by MIME types
    - Extension checking
    - Support for multiple content types
    """

    def __init__(self, allowed_types: Union[str, List[str]],
                 check_extension: bool = True):
        super().__init__()
        if isinstance(allowed_types, str):
            self.allowed_types = [allowed_types]
        else:
            self.allowed_types = allowed_types
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Lowercase every configured content type."""
        self.allowed_types = [mime.lower() for mime in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Match the URL's file extension against the allowed MIME types."""
        path = urlparse(url).path
        ext = path.split('.')[-1].lower() if '.' in path else ''
        if not ext:
            # No extension at all: could be dynamic content, let it pass.
            return True

        guessed = (mimetypes.guess_type(url)[0] or '').lower()
        return any(mime in guessed for mime in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Return True when the URL's apparent content type is allowed."""
        passed = self._check_extension(url) if self.check_extension else True
        self._update_stats(passed)
        return passed
|
||||
|
||||
class DomainFilter(URLFilter):
    """Filter URLs based on allowed/blocked domains.

    domain_filter = DomainFilter(
        allowed_domains=["example.com", "blog.example.com"],
        blocked_domains=["ads.example.com"]
    )

    - Allow/block specific domains
    - Subdomain support: "sub.example.com" matches an entry "example.com"
    - Efficient domain matching (set lookups + suffix check)
    """

    def __init__(self, allowed_domains: Union[str, List[str]] = None,
                 blocked_domains: Union[str, List[str]] = None):
        # allowed_domains=None means "allow everything not blocked".
        super().__init__()
        self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None
        self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set()

    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Normalize domain strings (lowercase, stripped)."""
        if isinstance(domains, str):
            domains = [domains]
        return [d.lower().strip() for d in domains]

    def _extract_domain(self, url: str) -> str:
        """Extract the host from a URL, lowercased and without a port.

        FIX: previously the raw netloc (including ":port") was compared,
        so "example.com:8080" never matched a configured "example.com".
        """
        return urlparse(url).netloc.lower().split(':')[0]

    @staticmethod
    def _matches(domain: str, candidates) -> bool:
        """True if *domain* equals a candidate or is a subdomain of one."""
        return any(domain == c or domain.endswith('.' + c) for c in candidates)

    def apply(self, url: str) -> bool:
        """Check if URL's domain is allowed (block list takes precedence).

        FIX: matching now covers subdomains, as the class documentation
        advertised; the previous exact-netloc comparison rejected
        "blog.example.com" even when "example.com" was allowed.
        """
        domain = self._extract_domain(url)

        if self._matches(domain, self.blocked_domains):
            self._update_stats(False)
            return False

        if self.allowed_domains is not None and not self._matches(domain, self.allowed_domains):
            self._update_stats(False)
            return False

        self._update_stats(True)
        return True
|
||||
|
||||
# Example usage:
|
||||
def create_common_filter_chain() -> FilterChain:
    """Build a filter chain covering typical HTML-content crawling."""
    chain = FilterChain()
    # HTML files plus common content paths.
    chain.add_filter(URLPatternFilter([
        "*.html", "*.htm",  # HTML files
        "*/article/*", "*/blog/*"  # Common content paths
    ]))
    chain.add_filter(ContentTypeFilter([
        "text/html",
        "application/xhtml+xml"
    ]))
    chain.add_filter(DomainFilter(
        blocked_domains=["ads.*", "analytics.*"]
    ))
    return chain
|
||||
@@ -1,8 +1,43 @@
|
||||
from .url_filter import URLFilter
|
||||
from typing import List, Union
|
||||
from urllib.parse import urlparse
|
||||
import mimetypes
|
||||
|
||||
|
||||
class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type.

    FIX: this block previously carried merged diff residue — a legacy
    ``__init__(self, contentType)`` stub and a stub ``apply`` that returned
    True before the real logic, leaving the actual implementation dead.
    Only the real implementation is kept.
    """

    def __init__(self, allowed_types: Union[str, List[str]],
                 check_extension: bool = True):
        super().__init__()
        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Normalize content type strings"""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Check URL's file extension against the allowed MIME types."""
        path = urlparse(url).path
        ext = path.split('.')[-1].lower() if '.' in path else ''
        if not ext:
            return True  # No extension, might be dynamic content

        guessed_type = mimetypes.guess_type(url)[0]
        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
        result = True
        if self.check_extension:
            result = self._check_extension(url)
        self._update_stats(result)
        return result
|
||||
|
||||
# class ContentTypeFilter(URLFilter):
|
||||
# def __init__(self, contentType: str):
|
||||
# self.contentType = contentType
|
||||
# def apply(self, url: str) -> bool:
|
||||
# #TODO: This is a stub. Will implement this later
|
||||
# return True
|
||||
@@ -1,16 +1,72 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
from typing import List
|
||||
@dataclass
class FilterStats:
    """Statistics for filter applications"""
    total_urls: int = 0
    rejected_urls: int = 0
    passed_urls: int = 0


class URLFilter(ABC):
    """Base class for URL filters.

    FIX: this block previously carried merged diff residue — a legacy
    ``FilterChain`` stub was interleaved into the class body and
    ``_update_stats`` was orphaned. The clean definitions are kept.
    """

    def __init__(self, name: str = None):
        self.name = name or self.__class__.__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Apply the filter to a URL"""
        pass

    def _update_stats(self, passed: bool):
        """Update filter statistics"""
        self.stats.total_urls += 1
        if passed:
            self.stats.passed_urls += 1
        else:
            self.stats.rejected_urls += 1


class FilterChain:
    """Chain of URL filters, applied in order; first rejection wins.

    FIX: the file previously contained two ``FilterChain`` classes (an old
    stub and the real one) and dead code after an early ``return all(...)``
    in ``apply`` — only the complete implementation is kept.
    """

    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
        """Add a filter to the chain (returns self to enable chaining)."""
        self.filters.append(filter_)
        return self

    def apply(self, url: str) -> bool:
        """Apply all filters in the chain"""
        self.stats.total_urls += 1

        for filter_ in self.filters:
            if not filter_.apply(url):
                self.stats.rejected_urls += 1
                self.logger.debug(f"URL {url} rejected by {filter_.name}")
                return False

        self.stats.passed_urls += 1
        return True
|
||||
|
||||
# class URLFilter(ABC):
|
||||
# @abstractmethod
|
||||
# def apply(self, url: str) -> bool:
|
||||
# pass
|
||||
|
||||
# class FilterChain:
|
||||
# def __init__(self):
|
||||
# self.filters = []
|
||||
|
||||
# def add_filter(self, filter: URLFilter):
|
||||
# self.filters.append(filter)
|
||||
|
||||
# def apply(self, url: str) -> bool:
|
||||
# return all(filter.apply(url) for filter in self.filters)
|
||||
@@ -1,9 +1,39 @@
|
||||
from .url_filter import URLFilter
|
||||
from re import Pattern
|
||||
from typing import List, Union
|
||||
import re
|
||||
import fnmatch
|
||||
|
||||
|
||||
class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex.

    FIX: this block previously carried merged diff residue — a legacy
    ``__init__(self, pattern)`` stub and a stub ``apply`` that returned
    True before the real matching logic (leaving it dead). Only the real
    implementation is kept. ``Pattern`` here is ``re.Pattern``.
    """

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]],
                 use_glob: bool = True):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob
        self._compiled_patterns = []

        for pattern in self.patterns:
            if isinstance(pattern, str) and use_glob:
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to regex"""
        return re.compile(fnmatch.translate(pattern))

    def apply(self, url: str) -> bool:
        """Check if URL matches any of the patterns"""
        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
        self._update_stats(matches)
        return matches
|
||||
|
||||
# class URLPatternFilter(URLFilter):
|
||||
# def __init__(self, pattern: Pattern):
|
||||
# self.pattern = pattern
|
||||
# def apply(self, url: str) -> bool:
|
||||
# #TODO: This is a stub. Will implement this later.
|
||||
# return True
|
||||
@@ -1,2 +1,268 @@
|
||||
from .url_scorer import URLScorer
|
||||
from .keyword_relevance_scorer import KeywordRelevanceScorer
|
||||
# from .url_scorer import URLScorer
|
||||
# from .keyword_relevance_scorer import KeywordRelevanceScorer
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Optional, Union
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse, unquote
|
||||
import re
|
||||
from collections import defaultdict
|
||||
import math
|
||||
import logging
|
||||
|
||||
@dataclass
class ScoringStats:
    """Running aggregate statistics for URL scoring."""
    urls_scored: int = 0
    total_score: float = 0.0
    min_score: float = float('inf')
    max_score: float = float('-inf')

    def update(self, score: float):
        """Fold one score into the running aggregates."""
        self.urls_scored += 1
        self.total_score += score
        if score < self.min_score:
            self.min_score = score
        if score > self.max_score:
            self.max_score = score

    @property
    def average_score(self) -> float:
        """Mean of all recorded scores (0.0 before any update)."""
        if not self.urls_scored:
            return 0.0
        return self.total_score / self.urls_scored
|
||||
|
||||
class URLScorer(ABC):
    """Abstract base for URL scoring strategies.

    Subclasses implement _calculate_score(); callers use score(), which
    applies the configured weight and records statistics.
    """

    def __init__(self, weight: float = 1.0, name: str = None):
        self.weight = weight
        self.name = name if name else type(self).__name__
        self.stats = ScoringStats()
        self.logger = logging.getLogger(f"urlscorer.{self.name}")

    @abstractmethod
    def _calculate_score(self, url: str) -> float:
        """Compute the raw (unweighted) score for *url*."""
        ...

    def score(self, url: str) -> float:
        """Return the weighted score for *url* and update statistics."""
        weighted = self._calculate_score(url) * self.weight
        self.stats.update(weighted)
        return weighted
|
||||
|
||||
class CompositeScorer(URLScorer):
    """Aggregates several weighted scorers into one combined score."""

    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
        super().__init__(name="CompositeScorer")
        self.scorers = scorers
        self.normalize = normalize

    def _calculate_score(self, url: str) -> float:
        """Sum child scores; average them when normalization is enabled."""
        if not self.scorers:
            return 0.0  # empty sum, same as before
        combined = sum(child.score(url) for child in self.scorers)
        return combined / len(self.scorers) if self.normalize else combined
|
||||
|
||||
class KeywordRelevanceScorer(URLScorer):
    """Score URLs by how many configured keywords appear in them.

    keyword_scorer = KeywordRelevanceScorer(
        keywords=["python", "programming"],
        weight=1.0,
        case_sensitive=False
    )

    - Score based on keyword matches
    - Case sensitivity options
    - Weighted scoring
    """

    def __init__(self, keywords: List[str], weight: float = 1.0,
                 case_sensitive: bool = False):
        super().__init__(weight=weight)
        self.keywords = keywords
        self.case_sensitive = case_sensitive
        self._compile_keywords()

    def _compile_keywords(self):
        """Pre-compile one escaped regex per keyword."""
        flags = re.IGNORECASE if not self.case_sensitive else 0
        self.patterns = [re.compile(re.escape(kw), flags) for kw in self.keywords]

    def _calculate_score(self, url: str) -> float:
        """Fraction of keywords found in the percent-decoded URL (0..1)."""
        if not self.patterns:
            return 0.0
        target = unquote(url)
        hits = sum(bool(p.search(target)) for p in self.patterns)
        return hits / len(self.patterns)
|
||||
|
||||
class PathDepthScorer(URLScorer):
    """Score URLs by how close their path depth is to an optimal depth.

    path_scorer = PathDepthScorer(
        optimal_depth=3,  # Preferred URL depth
        weight=0.7
    )

    Score is 1.0 at the optimal depth and decays hyperbolically as the
    depth moves away from it in either direction.
    """

    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
        super().__init__(weight=weight)
        self.optimal_depth = optimal_depth

    def _calculate_score(self, url: str) -> float:
        """Return 1 / (1 + |depth - optimal_depth|) for the URL's path."""
        segments = [seg for seg in urlparse(url).path.split('/') if seg]
        offset = abs(len(segments) - self.optimal_depth)
        return 1.0 / (1.0 + offset)
|
||||
|
||||
class ContentTypeScorer(URLScorer):
    """Score URLs by matching file-type patterns to configured weights.

    content_scorer = ContentTypeScorer({
        r'\.html$': 1.0,
        r'\.pdf$': 0.8,
        r'\.xml$': 0.6
    })

    - Score based on file types
    - Configurable type weights
    - Pattern matching support
    """

    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
        super().__init__(weight=weight)
        self.type_weights = type_weights
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile each pattern once, preserving its configured weight."""
        self.patterns = {re.compile(src): w for src, w in self.type_weights.items()}

    def _calculate_score(self, url: str) -> float:
        """Weight of the first matching pattern, else 0.0."""
        for compiled, pattern_weight in self.patterns.items():
            if compiled.search(url):
                return pattern_weight
        return 0.0
|
||||
|
||||
class FreshnessScorer(URLScorer):
    """Score URLs based on freshness indicators.

    freshness_scorer = FreshnessScorer(weight=0.9)

    Score based on date indicators in URLs
    Multiple date format support
    Recency weighting
    """

    def __init__(self, weight: float = 1.0, current_year: int = None):
        """
        Args:
            weight: scorer weight applied by the base class.
            current_year: reference year for the recency decay; defaults
                to the system clock. FIX: the previous code hard-coded
                2024, which silently went stale.
        """
        super().__init__(weight=weight)
        if current_year is None:
            from datetime import date  # local import keeps module-level deps unchanged
            current_year = date.today().year
        self.current_year = current_year
        self.date_patterns = [
            r'/(\d{4})/(\d{2})/(\d{2})/',  # yyyy/mm/dd
            r'(\d{4})[-_](\d{2})[-_](\d{2})',  # yyyy-mm-dd
            r'/(\d{4})/',  # year only
        ]
        self._compile_patterns()

    def _compile_patterns(self):
        """Prepare date patterns"""
        self.compiled_patterns = [re.compile(p) for p in self.date_patterns]

    def _calculate_score(self, url: str) -> float:
        """Score by the first date found in the URL.

        Newer years score higher, decaying 0.1 per year of age. FIX: the
        result is clamped to [0, 1] — the unclamped formula returned
        values above 1 for future dates and negative values for old ones.
        """
        for pattern in self.compiled_patterns:
            if match := pattern.search(url):
                year = int(match.group(1))
                raw = 1.0 - (self.current_year - year) * 0.1
                return max(0.0, min(1.0, raw))
        return 0.5  # Default score for URLs without dates
|
||||
|
||||
class DomainAuthorityScorer(URLScorer):
    """Score URLs by a per-domain authority table.

    authority_scorer = DomainAuthorityScorer({
        "python.org": 1.0,
        "github.com": 0.9,
        "medium.com": 0.7
    })

    Score based on domain importance
    Configurable domain weights
    Default weight for unknown domains"""

    def __init__(self, domain_weights: Dict[str, float],
                 default_weight: float = 0.5, weight: float = 1.0):
        super().__init__(weight=weight)
        self.domain_weights = domain_weights
        self.default_weight = default_weight

    def _calculate_score(self, url: str) -> float:
        """Look up the URL's host in the authority table."""
        host = urlparse(url).netloc.lower()
        return self.domain_weights.get(host, self.default_weight)
|
||||
|
||||
def create_balanced_scorer() -> CompositeScorer:
    """Assemble a balanced composite scorer covering relevance, path
    structure, file type, and freshness."""
    scorers = [
        KeywordRelevanceScorer(
            keywords=["article", "blog", "news", "research"],
            weight=1.0,
        ),
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7,
        ),
        ContentTypeScorer(
            type_weights={
                r'\.html?$': 1.0,
                r'\.pdf$': 0.8,
                r'\.xml$': 0.6,
            },
            weight=0.8,
        ),
        FreshnessScorer(
            weight=0.9,
        ),
    ]
    return CompositeScorer(scorers)
|
||||
|
||||
# Example Usage:
|
||||
"""
|
||||
# Create a composite scorer
|
||||
scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["python", "programming"], weight=1.0),
|
||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
||||
FreshnessScorer(weight=0.8),
|
||||
DomainAuthorityScorer(
|
||||
domain_weights={
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.9,
|
||||
"medium.com": 0.7
|
||||
},
|
||||
weight=0.9
|
||||
)
|
||||
])
|
||||
|
||||
# Score a URL
|
||||
score = scorer.score("https://python.org/article/2024/01/new-features")
|
||||
|
||||
# Access statistics
|
||||
print(f"Average score: {scorer.stats.average_score}")
|
||||
print(f"URLs scored: {scorer.stats.urls_scored}")
|
||||
"""
|
||||
342
docs/scrapper/filters_scrorers.md
Normal file
342
docs/scrapper/filters_scrorers.md
Normal file
@@ -0,0 +1,342 @@
|
||||
# URL Filters and Scorers
|
||||
|
||||
The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively.
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
Start([URL Input]) --> Chain[Filter Chain]
|
||||
|
||||
subgraph Chain Process
|
||||
Chain --> Pattern{URL Pattern\nFilter}
|
||||
Pattern -->|Match| Content{Content Type\nFilter}
|
||||
Pattern -->|No Match| Reject1[Reject URL]
|
||||
|
||||
Content -->|Allowed| Domain{Domain\nFilter}
|
||||
Content -->|Not Allowed| Reject2[Reject URL]
|
||||
|
||||
Domain -->|Allowed| Accept[Accept URL]
|
||||
Domain -->|Blocked| Reject3[Reject URL]
|
||||
end
|
||||
|
||||
subgraph Statistics
|
||||
Pattern --> UpdatePattern[Update Pattern Stats]
|
||||
Content --> UpdateContent[Update Content Stats]
|
||||
Domain --> UpdateDomain[Update Domain Stats]
|
||||
Accept --> UpdateChain[Update Chain Stats]
|
||||
Reject1 --> UpdateChain
|
||||
Reject2 --> UpdateChain
|
||||
Reject3 --> UpdateChain
|
||||
end
|
||||
|
||||
Accept --> End([End])
|
||||
Reject1 --> End
|
||||
Reject2 --> End
|
||||
Reject3 --> End
|
||||
|
||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||
classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||
classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||
|
||||
class Start,End accept;
|
||||
class Pattern,Content,Domain decision;
|
||||
class Reject1,Reject2,Reject3 reject;
|
||||
class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process;
|
||||
```
|
||||
|
||||
## URL Filters
|
||||
|
||||
URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules.
|
||||
|
||||
### Available Filters
|
||||
|
||||
1. **URL Pattern Filter**
|
||||
```python
|
||||
pattern_filter = URLPatternFilter([
|
||||
"*.example.com/*", # Glob pattern
|
||||
"*/article/*", # Path pattern
|
||||
re.compile(r"blog-\d+") # Regex pattern
|
||||
])
|
||||
```
|
||||
- Supports glob patterns and regex
|
||||
- Multiple patterns per filter
|
||||
- Pattern pre-compilation for performance
|
||||
|
||||
2. **Content Type Filter**
|
||||
```python
|
||||
content_filter = ContentTypeFilter([
|
||||
"text/html",
|
||||
"application/pdf"
|
||||
], check_extension=True)
|
||||
```
|
||||
- Filter by MIME types
|
||||
- Extension checking
|
||||
- Support for multiple content types
|
||||
|
||||
3. **Domain Filter**
|
||||
```python
|
||||
domain_filter = DomainFilter(
|
||||
allowed_domains=["example.com", "blog.example.com"],
|
||||
blocked_domains=["ads.example.com"]
|
||||
)
|
||||
```
|
||||
- Allow/block specific domains
|
||||
- Subdomain support
|
||||
- Efficient domain matching
|
||||
|
||||
### Creating Filter Chains
|
||||
|
||||
```python
|
||||
# Create and configure a filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter(["*.example.com/*"]),
|
||||
ContentTypeFilter(["text/html"]),
|
||||
DomainFilter(blocked_domains=["ads.*"])
|
||||
])
|
||||
|
||||
# Add more filters
|
||||
filter_chain.add_filter(
|
||||
URLPatternFilter(["*/article/*"])
|
||||
)
|
||||
```
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
Start([URL Input]) --> Composite[Composite Scorer]
|
||||
|
||||
subgraph Scoring Process
|
||||
Composite --> Keywords[Keyword Relevance]
|
||||
Composite --> Path[Path Depth]
|
||||
Composite --> Content[Content Type]
|
||||
Composite --> Fresh[Freshness]
|
||||
Composite --> Domain[Domain Authority]
|
||||
|
||||
Keywords --> KeywordScore[Calculate Score]
|
||||
Path --> PathScore[Calculate Score]
|
||||
Content --> ContentScore[Calculate Score]
|
||||
Fresh --> FreshScore[Calculate Score]
|
||||
Domain --> DomainScore[Calculate Score]
|
||||
|
||||
KeywordScore --> Weight1[Apply Weight]
|
||||
PathScore --> Weight2[Apply Weight]
|
||||
ContentScore --> Weight3[Apply Weight]
|
||||
FreshScore --> Weight4[Apply Weight]
|
||||
DomainScore --> Weight5[Apply Weight]
|
||||
end
|
||||
|
||||
Weight1 --> Combine[Combine Scores]
|
||||
Weight2 --> Combine
|
||||
Weight3 --> Combine
|
||||
Weight4 --> Combine
|
||||
Weight5 --> Combine
|
||||
|
||||
Combine --> Normalize{Normalize?}
|
||||
Normalize -->|Yes| NormalizeScore[Normalize Combined Score]
|
||||
Normalize -->|No| FinalScore[Final Score]
|
||||
NormalizeScore --> FinalScore
|
||||
|
||||
FinalScore --> Stats[Update Statistics]
|
||||
Stats --> End([End])
|
||||
|
||||
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
|
||||
classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px;
|
||||
classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px;
|
||||
classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px;
|
||||
|
||||
class Start,End calc;
|
||||
class Keywords,Path,Content,Fresh,Domain scorer;
|
||||
class KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process;
|
||||
class Normalize decision;
|
||||
```
|
||||
|
||||
## URL Scorers
|
||||
|
||||
URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority.
|
||||
|
||||
### Available Scorers
|
||||
|
||||
1. **Keyword Relevance Scorer**
|
||||
```python
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["python", "programming"],
|
||||
weight=1.0,
|
||||
case_sensitive=False
|
||||
)
|
||||
```
|
||||
- Score based on keyword matches
|
||||
- Case sensitivity options
|
||||
- Weighted scoring
|
||||
|
||||
2. **Path Depth Scorer**
|
||||
```python
|
||||
path_scorer = PathDepthScorer(
|
||||
optimal_depth=3, # Preferred URL depth
|
||||
weight=0.7
|
||||
)
|
||||
```
|
||||
- Score based on URL path depth
|
||||
- Configurable optimal depth
|
||||
- Diminishing returns for deeper paths
|
||||
|
||||
3. **Content Type Scorer**
|
||||
```python
|
||||
content_scorer = ContentTypeScorer({
|
||||
r'\.html$': 1.0,
|
||||
r'\.pdf$': 0.8,
|
||||
r'\.xml$': 0.6
|
||||
})
|
||||
```
|
||||
- Score based on file types
|
||||
- Configurable type weights
|
||||
- Pattern matching support
|
||||
|
||||
4. **Freshness Scorer**
|
||||
```python
|
||||
freshness_scorer = FreshnessScorer(weight=0.9)
|
||||
```
|
||||
- Score based on date indicators in URLs
|
||||
- Multiple date format support
|
||||
- Recency weighting
|
||||
|
||||
5. **Domain Authority Scorer**
|
||||
```python
|
||||
authority_scorer = DomainAuthorityScorer({
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.9,
|
||||
"medium.com": 0.7
|
||||
})
|
||||
```
|
||||
- Score based on domain importance
|
||||
- Configurable domain weights
|
||||
- Default weight for unknown domains
|
||||
|
||||
### Combining Scorers
|
||||
|
||||
```python
|
||||
# Create a composite scorer
|
||||
composite_scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["python"], weight=1.0),
|
||||
PathDepthScorer(optimal_depth=2, weight=0.7),
|
||||
FreshnessScorer(weight=0.8)
|
||||
], normalize=True)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Filter Configuration
|
||||
|
||||
1. **Start Restrictive**
|
||||
```python
|
||||
# Begin with strict filters
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(allowed_domains=["example.com"]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
```
|
||||
|
||||
2. **Layer Filters**
|
||||
```python
|
||||
# Add more specific filters
|
||||
filter_chain.add_filter(
|
||||
URLPatternFilter(["*/article/*", "*/blog/*"])
|
||||
)
|
||||
```
|
||||
|
||||
3. **Monitor Filter Statistics**
|
||||
```python
|
||||
# Check filter performance
|
||||
for filter in filter_chain.filters:
|
||||
print(f"{filter.name}: {filter.stats.rejected_urls} rejected")
|
||||
```
|
||||
|
||||
### Scorer Configuration
|
||||
|
||||
1. **Balance Weights**
|
||||
```python
|
||||
# Balanced scoring configuration
|
||||
scorer = create_balanced_scorer()
|
||||
```
|
||||
|
||||
2. **Customize for Content**
|
||||
```python
|
||||
# News site configuration
|
||||
news_scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["news", "article"], weight=1.0),
|
||||
FreshnessScorer(weight=1.0),
|
||||
PathDepthScorer(optimal_depth=2, weight=0.5)
|
||||
])
|
||||
```
|
||||
|
||||
3. **Monitor Scoring Statistics**
|
||||
```python
|
||||
# Check scoring distribution
|
||||
print(f"Average score: {scorer.stats.average_score}")
|
||||
print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}")
|
||||
```
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Blog Crawling
|
||||
```python
|
||||
blog_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/blog/*", "*/post/*"]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
FreshnessScorer(weight=1.0),
|
||||
KeywordRelevanceScorer(["blog", "article"], weight=0.8)
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
### Documentation Sites
|
||||
```python
|
||||
docs_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/docs/*", "*/guide/*"]),
|
||||
ContentTypeFilter(["text/html", "application/pdf"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
PathDepthScorer(optimal_depth=3, weight=1.0),
|
||||
KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9)
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
### E-commerce Sites
|
||||
```python
|
||||
ecommerce_config = {
|
||||
'filters': FilterChain([
|
||||
URLPatternFilter(["*/product/*", "*/category/*"]),
|
||||
DomainFilter(blocked_domains=["ads.*", "tracker.*"])
|
||||
]),
|
||||
'scorer': CompositeScorer([
|
||||
PathDepthScorer(optimal_depth=2, weight=1.0),
|
||||
ContentTypeScorer({
|
||||
r'/product/': 1.0,
|
||||
r'/category/': 0.8
|
||||
})
|
||||
])
|
||||
}
|
||||
```
|
||||
|
||||
## Advanced Topics
|
||||
|
||||
### Custom Filters
|
||||
```python
|
||||
class CustomFilter(URLFilter):
|
||||
def apply(self, url: str) -> bool:
|
||||
# Your custom filtering logic
|
||||
return True
|
||||
```
|
||||
|
||||
### Custom Scorers
|
||||
```python
|
||||
class CustomScorer(URLScorer):
|
||||
def _calculate_score(self, url: str) -> float:
|
||||
# Your custom scoring logic
|
||||
return 1.0
|
||||
```
|
||||
|
||||
For more examples, check our [example repository](https://github.com/example/crawl4ai/examples).
|
||||
206
docs/scrapper/how_to_use.md
Normal file
206
docs/scrapper/how_to_use.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Scraper Examples Guide
|
||||
|
||||
This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features.
|
||||
|
||||
## Basic Example
|
||||
|
||||
The basic example demonstrates a simple blog scraping scenario:
|
||||
|
||||
```python
|
||||
from crawl4ai.scraper import AsyncWebScraper, BFSScraperStrategy, FilterChain
|
||||
|
||||
# Create simple filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter("*/blog/*"),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
|
||||
# Initialize strategy
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=None,
|
||||
max_concurrent=3
|
||||
)
|
||||
|
||||
# Create and run scraper
|
||||
crawler = AsyncWebCrawler()
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
result = await scraper.ascrape("https://example.com/blog/")
|
||||
```
|
||||
|
||||
### Features Demonstrated
|
||||
- Basic URL filtering
|
||||
- Simple content type filtering
|
||||
- Depth control
|
||||
- Concurrent request limiting
|
||||
- Result collection
|
||||
|
||||
## Advanced Example
|
||||
|
||||
The advanced example shows a sophisticated news site scraping setup with all features enabled:
|
||||
|
||||
```python
|
||||
# Create comprehensive filter chain
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(
|
||||
allowed_domains=["example.com"],
|
||||
blocked_domains=["ads.example.com"]
|
||||
),
|
||||
URLPatternFilter([
|
||||
"*/article/*",
|
||||
re.compile(r"\d{4}/\d{2}/.*")
|
||||
]),
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
|
||||
# Create intelligent scorer
|
||||
scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(
|
||||
keywords=["news", "breaking"],
|
||||
weight=1.0
|
||||
),
|
||||
PathDepthScorer(optimal_depth=3, weight=0.7),
|
||||
FreshnessScorer(weight=0.9)
|
||||
])
|
||||
|
||||
# Initialize advanced strategy
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=4,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=scorer,
|
||||
max_concurrent=5
|
||||
)
|
||||
```
|
||||
|
||||
### Features Demonstrated
|
||||
1. **Advanced Filtering**
|
||||
- Domain filtering
|
||||
- Pattern matching
|
||||
- Content type control
|
||||
|
||||
2. **Intelligent Scoring**
|
||||
- Keyword relevance
|
||||
- Path optimization
|
||||
- Freshness priority
|
||||
|
||||
3. **Monitoring**
|
||||
- Progress tracking
|
||||
- Error handling
|
||||
- Statistics collection
|
||||
|
||||
4. **Resource Management**
|
||||
- Concurrent processing
|
||||
- Rate limiting
|
||||
- Cleanup handling
|
||||
|
||||
## Running the Examples
|
||||
|
||||
```bash
|
||||
# Basic usage
|
||||
python basic_scraper_example.py
|
||||
|
||||
# Advanced usage with logging
|
||||
PYTHONPATH=. python advanced_scraper_example.py
|
||||
```
|
||||
|
||||
## Example Output
|
||||
|
||||
### Basic Example
|
||||
```
|
||||
Crawled 15 pages:
|
||||
- https://example.com/blog/post1: 24560 bytes
|
||||
- https://example.com/blog/post2: 18920 bytes
|
||||
...
|
||||
```
|
||||
|
||||
### Advanced Example
|
||||
```
|
||||
INFO: Starting crawl of https://example.com/news/
|
||||
INFO: Processed: https://example.com/news/breaking/story1
|
||||
DEBUG: KeywordScorer: 0.85
|
||||
DEBUG: FreshnessScorer: 0.95
|
||||
INFO: Progress: 10 URLs processed
|
||||
...
|
||||
INFO: Scraping completed:
|
||||
INFO: - URLs processed: 50
|
||||
INFO: - Errors: 2
|
||||
INFO: - Total content size: 1240.50 KB
|
||||
```
|
||||
|
||||
## Customization
|
||||
|
||||
### Adding Custom Filters
|
||||
```python
|
||||
class CustomFilter(URLFilter):
|
||||
def apply(self, url: str) -> bool:
|
||||
# Your custom filtering logic
|
||||
return True
|
||||
|
||||
filter_chain.add_filter(CustomFilter())
|
||||
```
|
||||
|
||||
### Custom Scoring Logic
|
||||
```python
|
||||
class CustomScorer(URLScorer):
|
||||
def _calculate_score(self, url: str) -> float:
|
||||
# Your custom scoring logic
|
||||
return 1.0
|
||||
|
||||
scorer = CompositeScorer([
|
||||
CustomScorer(weight=1.0),
|
||||
...
|
||||
])
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Start Simple**
|
||||
- Begin with basic filtering
|
||||
- Add features incrementally
|
||||
- Test thoroughly at each step
|
||||
|
||||
2. **Monitor Performance**
|
||||
- Watch memory usage
|
||||
- Track processing times
|
||||
- Adjust concurrency as needed
|
||||
|
||||
3. **Handle Errors**
|
||||
- Implement proper error handling
|
||||
- Log important events
|
||||
- Track error statistics
|
||||
|
||||
4. **Optimize Resources**
|
||||
- Set appropriate delays
|
||||
- Limit concurrent requests
|
||||
- Use streaming for large crawls
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
Common issues and solutions:
|
||||
|
||||
1. **Too Many Requests**
|
||||
```python
|
||||
strategy = BFSScraperStrategy(
|
||||
max_concurrent=3, # Reduce concurrent requests
|
||||
min_crawl_delay=2 # Increase delay between requests
|
||||
)
|
||||
```
|
||||
|
||||
2. **Memory Issues**
|
||||
```python
|
||||
# Use streaming mode for large crawls
|
||||
async for result in scraper.ascrape(url, stream=True):
|
||||
process_result(result)
|
||||
```
|
||||
|
||||
3. **Missing Content**
|
||||
```python
|
||||
# Check your filter chain
|
||||
filter_chain = FilterChain([
|
||||
URLPatternFilter("*"), # Broaden patterns
|
||||
ContentTypeFilter(["*"]) # Accept all content
|
||||
])
|
||||
```
|
||||
|
||||
For more examples and use cases, visit our [GitHub repository](https://github.com/example/crawl4ai/examples).
|
||||
111
docs/scrapper/web_crawler_quick_start.py
Normal file
111
docs/scrapper/web_crawler_quick_start.py
Normal file
@@ -0,0 +1,111 @@
|
||||
import unittest, os
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
|
||||
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy
|
||||
|
||||
class TestWebCrawler(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.crawler = WebCrawler()
|
||||
|
||||
def test_warmup(self):
|
||||
self.crawler.warmup()
|
||||
self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")
|
||||
|
||||
def test_run_default_strategies(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract using default strategies")
|
||||
|
||||
def test_run_different_strategies(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# Test with FixedLengthWordChunking and LLMExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-3.5-turbo", api_token=os.getenv('OPENAI_API_KEY')), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")
|
||||
|
||||
# Test with SlidingWindowChunking and TopicExtractionStrategy
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=TopicExtractionStrategy(num_keywords=5), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")
|
||||
|
||||
def test_invalid_url(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='invalid_url', bypass_cache=True)
|
||||
self.assertIn("Invalid URL", str(context.exception))
|
||||
|
||||
def test_unsupported_extraction_strategy(self):
|
||||
with self.assertRaises(Exception) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
|
||||
self.assertIn("Unsupported extraction strategy", str(context.exception))
|
||||
|
||||
def test_invalid_css_selector(self):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
|
||||
self.assertIn("Invalid CSS selector", str(context.exception))
|
||||
|
||||
|
||||
def test_crawl_with_cache_and_bypass_cache(self):
|
||||
url = 'https://www.nbcnews.com/business'
|
||||
|
||||
# First crawl with cache enabled
|
||||
result = self.crawler.run(url=url, bypass_cache=False)
|
||||
self.assertTrue(result.success, "Failed to crawl and cache the result")
|
||||
|
||||
# Second crawl with bypass_cache=True
|
||||
result = self.crawler.run(url=url, bypass_cache=True)
|
||||
self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")
|
||||
|
||||
def test_fetch_multiple_pages(self):
|
||||
urls = [
|
||||
'https://www.nbcnews.com/business',
|
||||
'https://www.bbc.com/news'
|
||||
]
|
||||
results = []
|
||||
for url in urls:
|
||||
result = self.crawler.run(
|
||||
url=url,
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=RegexChunking(),
|
||||
extraction_strategy=CosineStrategy(),
|
||||
bypass_cache=True
|
||||
)
|
||||
results.append(result)
|
||||
|
||||
self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
|
||||
for result in results:
|
||||
self.assertTrue(result.success, "Failed to crawl and extract a page in the list")
|
||||
|
||||
def test_run_fixed_length_word_chunking_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")
|
||||
|
||||
def test_run_sliding_window_and_no_extraction(self):
|
||||
result = self.crawler.run(
|
||||
url='https://www.nbcnews.com/business',
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
|
||||
extraction_strategy=NoExtractionStrategy(), bypass_cache=True
|
||||
)
|
||||
self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
184
tests/test_scraper.py
Normal file
184
tests/test_scraper.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# basic_scraper_example.py
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
|
||||
async def basic_scraper_example():
|
||||
"""
|
||||
Basic example: Scrape a blog site for articles
|
||||
- Crawls only HTML pages
|
||||
- Stays within the blog section
|
||||
- Collects all results at once
|
||||
"""
|
||||
# Create a simple filter chain
|
||||
filter_chain = FilterChain([
|
||||
# Only crawl pages within the blog section
|
||||
URLPatternFilter("*/blog/*"),
|
||||
# Only process HTML pages
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
|
||||
# Initialize the strategy with basic configuration
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=2, # Only go 2 levels deep
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=None, # Use default scoring
|
||||
max_concurrent=3 # Limit concurrent requests
|
||||
)
|
||||
|
||||
# Create the crawler and scraper
|
||||
crawler = AsyncWebCrawler()
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
|
||||
# Start scraping
|
||||
try:
|
||||
result = await scraper.ascrape("https://example.com/blog/")
|
||||
|
||||
# Process results
|
||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
||||
for url, data in result.extracted_data.items():
|
||||
print(f"- {url}: {len(data.html)} bytes")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during scraping: {e}")
|
||||
|
||||
# advanced_scraper_example.py
|
||||
import logging
|
||||
from crawl4ai.scraper import (
|
||||
AsyncWebScraper,
|
||||
BFSScraperStrategy,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
FreshnessScorer,
|
||||
CompositeScorer
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
|
||||
async def advanced_scraper_example():
|
||||
"""
|
||||
Advanced example: Intelligent news site scraping
|
||||
- Uses all filter types
|
||||
- Implements sophisticated scoring
|
||||
- Streams results
|
||||
- Includes monitoring and logging
|
||||
"""
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("advanced_scraper")
|
||||
|
||||
# Create sophisticated filter chain
|
||||
filter_chain = FilterChain([
|
||||
# Domain control
|
||||
DomainFilter(
|
||||
allowed_domains=["example.com", "blog.example.com"],
|
||||
blocked_domains=["ads.example.com", "tracker.example.com"]
|
||||
),
|
||||
# URL patterns
|
||||
URLPatternFilter([
|
||||
"*/article/*",
|
||||
"*/news/*",
|
||||
"*/blog/*",
|
||||
re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs
|
||||
]),
|
||||
# Content types
|
||||
ContentTypeFilter([
|
||||
"text/html",
|
||||
"application/xhtml+xml"
|
||||
])
|
||||
])
|
||||
|
||||
# Create composite scorer
|
||||
scorer = CompositeScorer([
|
||||
# Prioritize by keywords
|
||||
KeywordRelevanceScorer(
|
||||
keywords=["news", "breaking", "update", "latest"],
|
||||
weight=1.0
|
||||
),
|
||||
# Prefer optimal URL structure
|
||||
PathDepthScorer(
|
||||
optimal_depth=3,
|
||||
weight=0.7
|
||||
),
|
||||
# Prioritize fresh content
|
||||
FreshnessScorer(weight=0.9)
|
||||
])
|
||||
|
||||
# Initialize strategy with advanced configuration
|
||||
strategy = BFSScraperStrategy(
|
||||
max_depth=4,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=scorer,
|
||||
max_concurrent=5,
|
||||
min_crawl_delay=1
|
||||
)
|
||||
|
||||
# Create crawler and scraper
|
||||
crawler = AsyncWebCrawler()
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
|
||||
# Track statistics
|
||||
stats = {
|
||||
'processed': 0,
|
||||
'errors': 0,
|
||||
'total_size': 0
|
||||
}
|
||||
|
||||
try:
|
||||
# Use streaming mode
|
||||
async for result in scraper.ascrape("https://example.com/news/", stream=True):
|
||||
stats['processed'] += 1
|
||||
|
||||
if result.success:
|
||||
stats['total_size'] += len(result.html)
|
||||
logger.info(f"Processed: {result.url}")
|
||||
|
||||
# Print scoring information
|
||||
for scorer_name, score in result.scores.items():
|
||||
logger.debug(f"{scorer_name}: {score:.2f}")
|
||||
else:
|
||||
stats['errors'] += 1
|
||||
logger.error(f"Failed to process {result.url}: {result.error_message}")
|
||||
|
||||
# Log progress regularly
|
||||
if stats['processed'] % 10 == 0:
|
||||
logger.info(f"Progress: {stats['processed']} URLs processed")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Scraping error: {e}")
|
||||
|
||||
finally:
|
||||
# Print final statistics
|
||||
logger.info("Scraping completed:")
|
||||
logger.info(f"- URLs processed: {stats['processed']}")
|
||||
logger.info(f"- Errors: {stats['errors']}")
|
||||
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
|
||||
|
||||
# Print filter statistics
|
||||
for filter_ in filter_chain.filters:
|
||||
logger.info(f"{filter_.name} stats:")
|
||||
logger.info(f"- Passed: {filter_.stats.passed_urls}")
|
||||
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
|
||||
|
||||
# Print scorer statistics
|
||||
logger.info("Scoring statistics:")
|
||||
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
|
||||
logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
|
||||
# Run basic example
|
||||
print("Running basic scraper example...")
|
||||
asyncio.run(basic_scraper_example())
|
||||
|
||||
print("\nRunning advanced scraper example...")
|
||||
asyncio.run(advanced_scraper_example())
|
||||
Reference in New Issue
Block a user