Implement comprehensive URL filtering and scoring capabilities.

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations
- Add a Quick Start (the two runnable examples below)

Tests and docs included.

Note: Review URL normalization overlap with recent crawler changes.
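For orientation before the examples, here is a minimal, self-contained sketch of the chain idea the filter items above describe: a chain passes a URL only if every filter accepts it, and each filter keeps pass/reject counts for the stats tracking mentioned above. The class and method names here (SimpleURLPatternFilter, SimpleFilterChain, apply) are illustrative assumptions, not the library's actual interfaces.

# filter_chain_sketch.py -- illustrative sketch only, not crawl4ai internals
from fnmatch import fnmatch


class SimpleURLPatternFilter:
    """Hypothetical glob filter: accepts URLs matching the pattern."""

    def __init__(self, pattern: str):
        self.pattern = pattern
        self.passed_urls = 0
        self.rejected_urls = 0

    def apply(self, url: str) -> bool:
        ok = fnmatch(url, self.pattern)
        if ok:
            self.passed_urls += 1
        else:
            self.rejected_urls += 1
        return ok


class SimpleFilterChain:
    """A URL passes the chain only if every filter accepts it."""

    def __init__(self, filters):
        self.filters = filters

    def apply(self, url: str) -> bool:
        return all(f.apply(url) for f in self.filters)


if __name__ == "__main__":
    chain = SimpleFilterChain([SimpleURLPatternFilter("*/blog/*")])
    print(chain.apply("https://example.com/blog/post-1"))  # True
    print(chain.apply("https://example.com/shop/item-9"))  # False
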
# basic_scraper_example.py
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler


async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
    - Crawls only HTML pages
    - Stays within the blog section
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain([
        # Only crawl pages within the blog section
        URLPatternFilter("*/blog/*"),
        # Only process HTML pages
        ContentTypeFilter(["text/html"])
    ])

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,        # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,    # Use default scoring
        max_concurrent=3    # Limit concurrent requests
    )

    # Create the crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Start scraping
    try:
        result = await scraper.ascrape("https://example.com/blog/")

        # Process results
        print(f"Crawled {len(result.crawled_urls)} pages:")
        for url, data in result.extracted_data.items():
            print(f"- {url}: {len(data.html)} bytes")

    except Exception as e:
        print(f"Error during scraping: {e}")

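The advanced example below layers scorers on top of the filters. As a mental model for what a keyword scorer might reduce to, here is a hypothetical sketch; SimpleKeywordScorer and its score method are assumptions for illustration, not the actual KeywordRelevanceScorer implementation.

# keyword_scorer_sketch.py -- illustrative sketch only, not crawl4ai internals
class SimpleKeywordScorer:
    """Hypothetical scorer: fraction of keywords found in the URL, times a weight."""

    def __init__(self, keywords, weight=1.0):
        self.keywords = [k.lower() for k in keywords]
        self.weight = weight

    def score(self, url: str) -> float:
        url_lower = url.lower()
        hits = sum(1 for k in self.keywords if k in url_lower)
        return self.weight * hits / len(self.keywords)


if __name__ == "__main__":
    scorer = SimpleKeywordScorer(["news", "breaking"], weight=1.0)
    print(scorer.score("https://example.com/news/breaking-story"))  # 1.0
    print(scorer.score("https://example.com/about"))                # 0.0
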
# advanced_scraper_example.py
import logging
import re  # needed for the compiled regex pattern below

from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer
)
from crawl4ai.async_webcrawler import AsyncWebCrawler

async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
    filter_chain = FilterChain([
        # Domain control
        DomainFilter(
            allowed_domains=["example.com", "blog.example.com"],
            blocked_domains=["ads.example.com", "tracker.example.com"]
        ),
        # URL patterns
        URLPatternFilter([
            "*/article/*",
            "*/news/*",
            "*/blog/*",
            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
        ]),
        # Content types
        ContentTypeFilter([
            "text/html",
            "application/xhtml+xml"
        ])
    ])

    # Create composite scorer
    scorer = CompositeScorer([
        # Prioritize by keywords
        KeywordRelevanceScorer(
            keywords=["news", "breaking", "update", "latest"],
            weight=1.0
        ),
        # Prefer optimal URL structure
        PathDepthScorer(
            optimal_depth=3,
            weight=0.7
        ),
        # Prioritize fresh content
        FreshnessScorer(weight=0.9)
    ])
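    # Assumed combination rule (illustrative, not verified against the library):
    # if CompositeScorer computes a weighted sum, then
    #   score(url) = 1.0 * keyword + 0.7 * path_depth + 0.9 * freshness,
    # so a URL scoring 1.0 on every sub-scorer would total 2.6.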

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
        max_depth=4,
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=5,
        min_crawl_delay=1
    )
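    # Assumed BFS semantics (not verified here): all pages at depth N are
    # fetched before any at depth N+1, so max_depth=4 bounds the crawl to four
    # link-hops from the start URL; min_crawl_delay spaces successive requests.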

    # Create crawler and scraper
    crawler = AsyncWebCrawler()
    scraper = AsyncWebScraper(crawler, strategy)

    # Track statistics
    stats = {
        'processed': 0,
        'errors': 0,
        'total_size': 0
    }

    try:
        # Use streaming mode
        async for result in scraper.ascrape("https://example.com/news/", stream=True):
            stats['processed'] += 1

            if result.success:
                stats['total_size'] += len(result.html)
                logger.info(f"Processed: {result.url}")

                # Print scoring information
                for scorer_name, score in result.scores.items():
                    logger.debug(f"{scorer_name}: {score:.2f}")
            else:
                stats['errors'] += 1
                logger.error(f"Failed to process {result.url}: {result.error_message}")

            # Log progress regularly
            if stats['processed'] % 10 == 0:
                logger.info(f"Progress: {stats['processed']} URLs processed")

    except Exception as e:
        logger.error(f"Scraping error: {e}")

    finally:
        # Print final statistics
        logger.info("Scraping completed:")
        logger.info(f"- URLs processed: {stats['processed']}")
        logger.info(f"- Errors: {stats['errors']}")
        logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

        # Print filter statistics
        for filter_ in filter_chain.filters:
            logger.info(f"{filter_.name} stats:")
            logger.info(f"- Passed: {filter_.stats.passed_urls}")
            logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

        # Print scorer statistics
        logger.info("Scoring statistics:")
        logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
        logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")


if __name__ == "__main__":
    import asyncio

    # Run basic example
    print("Running basic scraper example...")
    asyncio.run(basic_scraper_example())

    print("\nRunning advanced scraper example...")
    asyncio.run(advanced_scraper_example())