feat(scraper): Enhance URL filtering and scoring systems
Implement comprehensive URL filtering and scoring capabilities.

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations

Tests and docs included.

Note: Review URL normalization overlap with recent crawler changes.
@@ -1,2 +1,3 @@
 from .async_web_scraper import AsyncWebScraper
 from .bfs_scraper_strategy import BFSScraperStrategy
+from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter
@@ -1,43 +0,0 @@
from .url_filter import URLFilter
from typing import List, Union
from urllib.parse import urlparse
import mimetypes


class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type"""

    def __init__(self, allowed_types: Union[str, List[str]],
                 check_extension: bool = True):
        super().__init__()
        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
        self.check_extension = check_extension
        self._normalize_types()

    def _normalize_types(self):
        """Normalize content type strings"""
        self.allowed_types = [t.lower() for t in self.allowed_types]

    def _check_extension(self, url: str) -> bool:
        """Check URL's file extension"""
        path = urlparse(url).path
        ext = path.split('.')[-1].lower() if '.' in path else ''
        if not ext:
            return True  # No extension, might be dynamic content

        guessed_type = mimetypes.guess_type(url)[0]
        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
        result = True
        if self.check_extension:
            result = self._check_extension(url)
        self._update_stats(result)
        return result


# class ContentTypeFilter(URLFilter):
#     def __init__(self, contentType: str):
#         self.contentType = contentType
#     def apply(self, url: str) -> bool:
#         # TODO: This is a stub. Will implement this later
#         return True
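A quick usage sketch of the ContentTypeFilter above (URLs are illustrative; the import path follows the quickstart further down):

from crawl4ai.scraper import ContentTypeFilter

html_filter = ContentTypeFilter(["text/html"])
print(html_filter.apply("https://example.com/post.html"))  # True: .html guesses as text/html
print(html_filter.apply("https://example.com/logo.png"))   # False: image/png is not allowed
print(html_filter.apply("https://example.com/blog/"))      # True: no extension, treated as dynamic content
print(html_filter.stats)  # FilterStats(total_urls=3, rejected_urls=1, passed_urls=2)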
@@ -1,72 +0,0 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
import logging
from typing import List


@dataclass
class FilterStats:
    """Statistics for filter applications"""
    total_urls: int = 0
    rejected_urls: int = 0
    passed_urls: int = 0


class URLFilter(ABC):
    """Base class for URL filters"""

    def __init__(self, name: str = None):
        self.name = name or self.__class__.__name__
        self.stats = FilterStats()
        self.logger = logging.getLogger(f"urlfilter.{self.name}")

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Apply the filter to a URL"""
        pass

    def _update_stats(self, passed: bool):
        """Update filter statistics"""
        self.stats.total_urls += 1
        if passed:
            self.stats.passed_urls += 1
        else:
            self.stats.rejected_urls += 1


class FilterChain:
    """Chain of URL filters"""

    def __init__(self, filters: List[URLFilter] = None):
        self.filters = filters or []
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
        """Add a filter to the chain"""
        self.filters.append(filter_)
        return self  # Enable method chaining

    def apply(self, url: str) -> bool:
        """Apply all filters in the chain"""
        self.stats.total_urls += 1

        for filter_ in self.filters:
            if not filter_.apply(url):
                self.stats.rejected_urls += 1
                self.logger.debug(f"URL {url} rejected by {filter_.name}")
                return False

        self.stats.passed_urls += 1
        return True


# class URLFilter(ABC):
#     @abstractmethod
#     def apply(self, url: str) -> bool:
#         pass


# class FilterChain:
#     def __init__(self):
#         self.filters = []
#
#     def add_filter(self, filter: URLFilter):
#         self.filters.append(filter)
#
#     def apply(self, url: str) -> bool:
#         return all(filter.apply(url) for filter in self.filters)
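Because add_filter returns self, chains can also be composed fluently; a small sketch using the classes above (illustrative URL, import path as in the quickstart below):

from crawl4ai.scraper import FilterChain, URLPatternFilter, ContentTypeFilter

chain = (FilterChain()
         .add_filter(URLPatternFilter("*/docs/*"))
         .add_filter(ContentTypeFilter("text/html")))

if chain.apply("https://example.com/docs/intro.html"):
    print("accepted")  # both filters passed; chain.stats.passed_urls is now 1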
@@ -1,39 +0,0 @@
from .url_filter import URLFilter
from re import Pattern
from typing import List, Union
import re
import fnmatch


class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex"""

    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]],
                 use_glob: bool = True):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob
        self._compiled_patterns = []

        for pattern in self.patterns:
            if isinstance(pattern, str) and use_glob:
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to regex"""
        return re.compile(fnmatch.translate(pattern))

    def apply(self, url: str) -> bool:
        """Check if URL matches any of the patterns"""
        matches = any(pattern.search(url) for pattern in self._compiled_patterns)
        self._update_stats(matches)
        return matches


# class URLPatternFilter(URLFilter):
#     def __init__(self, pattern: Pattern):
#         self.pattern = pattern
#     def apply(self, url: str) -> bool:
#         # TODO: This is a stub. Will implement this later.
#         return True
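One behavior worth noting: glob patterns go through fnmatch.translate, which anchors the translated regex at the end of the string, so a trailing * is usually needed for a pattern to match a path segment inside a full URL. Precompiled regexes bypass the translation and are matched with search() as-is. A short sketch (illustrative URLs):

import re
from crawl4ai.scraper import URLPatternFilter

glob_filter = URLPatternFilter("*/blog/*")  # glob is the default
print(glob_filter.apply("https://example.com/blog/post-1"))  # True
print(glob_filter.apply("https://example.com/shop/item-1"))  # False

regex_filter = URLPatternFilter(re.compile(r"/\d{4}/\d{2}/"))  # compiled patterns are used as-is
print(regex_filter.apply("https://example.com/2024/11/launch"))  # True: search() finds /2024/11/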
@@ -1,9 +0,0 @@
from .url_scorer import URLScorer
from typing import List


class KeywordRelevanceScorer(URLScorer):
    def __init__(self, keywords: List[str]):
        self.keywords = keywords

    def score(self, url: str) -> float:
        # TODO: This is a stub. Will implement this later.
        return 1
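The commit message says the KeywordRelevanceScorer implementation was completed, but the finished version is not part of this diff; the hunk above shows only the removed stub. A minimal sketch of what the completed scorer could look like, assuming a score proportional to the fraction of keywords found in the URL (the weight parameter mirrors the quickstart usage below; the formula itself is an assumption):

from typing import List

class KeywordRelevanceScorer(URLScorer):
    """Sketch: score a URL by the fraction of its keywords present in the URL."""

    def __init__(self, keywords: List[str], weight: float = 1.0):
        self.keywords = [k.lower() for k in keywords]
        self.weight = weight  # weight appears in the quickstart; the formula below is assumed

    def score(self, url: str) -> float:
        if not self.keywords:
            return 0.0
        url_lower = url.lower()
        hits = sum(1 for keyword in self.keywords if keyword in url_lower)
        return self.weight * hits / len(self.keywords)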
@@ -1,6 +0,0 @@
from abc import ABC, abstractmethod


class URLScorer(ABC):
    @abstractmethod
    def score(self, url: str) -> float:
        pass
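CompositeScorer is likewise named in the commit message and used in the quickstart below, but its diff is not shown. A minimal sketch against the URLScorer interface above, assuming it simply sums its sub-scorers (each sub-scorer applying its own weight internally):

from typing import List

class CompositeScorer(URLScorer):
    """Sketch: combine several URLScorer instances into a single score."""

    def __init__(self, scorers: List[URLScorer]):
        self.scorers = scorers

    def score(self, url: str) -> float:
        # Assumed combination rule: plain sum; sub-scorers carry their own weights.
        return sum(scorer.score(url) for scorer in self.scorers)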
docs/scrapper/scraper_quickstart.py (new file, 184 lines)
@@ -0,0 +1,184 @@
# basic_scraper_example.py
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler


async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain([
        # Only crawl pages within the blog section
        URLPatternFilter("*/blog/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,       # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,   # Use default scoring
        max_concurrent=3   # Limit concurrent requests
    )

    # Create the crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Start scraping
    try:
        result = await scraper.ascrape("https://example.com/blog/")

        # Process results
        print(f"Crawled {len(result.crawled_urls)} pages:")
        for url, data in result.extracted_data.items():
            print(f"- {url}: {len(data.html)} bytes")

    except Exception as e:
        print(f"Error during scraping: {e}")


# advanced_scraper_example.py
import logging
import re  # needed for the regex URL pattern below

from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer
)
from crawl4ai.async_webcrawler import AsyncWebCrawler


async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1

            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")

                # Print scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")

            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")

    except Exception as e:
        logger.error(f"Scraping error: {e}")

    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")


if __name__ == "__main__":
    import asyncio

    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    print("\nRunning advanced scraper example...")
    asyncio.run(advanced_scraper_example())
@@ -1,111 +0,0 @@
import os
import unittest

from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import RegexChunking, FixedLengthWordChunking, SlidingWindowChunking
from crawl4ai.extraction_strategy import CosineStrategy, LLMExtractionStrategy, TopicExtractionStrategy, NoExtractionStrategy


class TestWebCrawler(unittest.TestCase):

    def setUp(self):
        self.crawler = WebCrawler()

    def test_warmup(self):
        self.crawler.warmup()
        self.assertTrue(self.crawler.ready, "WebCrawler failed to warm up")

    def test_run_default_strategies(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=RegexChunking(),
            extraction_strategy=CosineStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract using default strategies")

    def test_run_different_strategies(self):
        url = 'https://www.nbcnews.com/business'

        # Test with FixedLengthWordChunking and LLMExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-3.5-turbo",
                api_token=os.getenv('OPENAI_API_KEY')
            ),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy")

        # Test with SlidingWindowChunking and TopicExtractionStrategy
        result = self.crawler.run(
            url=url,
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=TopicExtractionStrategy(num_keywords=5),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy")

    def test_invalid_url(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='invalid_url', bypass_cache=True)
        self.assertIn("Invalid URL", str(context.exception))

    def test_unsupported_extraction_strategy(self):
        with self.assertRaises(Exception) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', extraction_strategy="UnsupportedStrategy", bypass_cache=True)
        self.assertIn("Unsupported extraction strategy", str(context.exception))

    def test_invalid_css_selector(self):
        with self.assertRaises(ValueError) as context:
            self.crawler.run(url='https://www.nbcnews.com/business', css_selector="invalid_selector", bypass_cache=True)
        self.assertIn("Invalid CSS selector", str(context.exception))

    def test_crawl_with_cache_and_bypass_cache(self):
        url = 'https://www.nbcnews.com/business'

        # First crawl with cache enabled
        result = self.crawler.run(url=url, bypass_cache=False)
        self.assertTrue(result.success, "Failed to crawl and cache the result")

        # Second crawl with bypass_cache=True
        result = self.crawler.run(url=url, bypass_cache=True)
        self.assertTrue(result.success, "Failed to bypass cache and fetch fresh data")

    def test_fetch_multiple_pages(self):
        urls = [
            'https://www.nbcnews.com/business',
            'https://www.bbc.com/news'
        ]
        results = []
        for url in urls:
            result = self.crawler.run(
                url=url,
                word_count_threshold=5,
                chunking_strategy=RegexChunking(),
                extraction_strategy=CosineStrategy(),
                bypass_cache=True
            )
            results.append(result)

        self.assertEqual(len(results), 2, "Failed to crawl and extract multiple pages")
        for result in results:
            self.assertTrue(result.success, "Failed to crawl and extract a page in the list")

    def test_run_fixed_length_word_chunking_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=FixedLengthWordChunking(chunk_size=100),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with FixedLengthWordChunking and NoExtractionStrategy")

    def test_run_sliding_window_and_no_extraction(self):
        result = self.crawler.run(
            url='https://www.nbcnews.com/business',
            word_count_threshold=5,
            chunking_strategy=SlidingWindowChunking(window_size=100, step=50),
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=True
        )
        self.assertTrue(result.success, "Failed to crawl and extract with SlidingWindowChunking and NoExtractionStrategy")


if __name__ == '__main__':
    unittest.main()