188 lines
5.8 KiB
Python
188 lines
5.8 KiB
Python
# basic_scraper_example.py
|
|
from crawl4ai.scraper import (
|
|
AsyncWebScraper,
|
|
BFSScraperStrategy,
|
|
FilterChain,
|
|
URLPatternFilter,
|
|
ContentTypeFilter,
|
|
)
|
|
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
|
|
import re
|
|
|
|
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
|
|
|
|
|
|
async def basic_scraper_example():
|
|
"""
|
|
Basic example: Scrape a blog site for articles
|
|
- Crawls only HTML pages
|
|
- Stays within the blog section
|
|
- Collects all results at once
|
|
"""
|
|
# Create a simple filter chain
|
|
filter_chain = FilterChain(
|
|
[
|
|
# Only crawl pages within the blog section
|
|
URLPatternFilter("*/tutorial/*"),
|
|
# Only process HTML pages
|
|
ContentTypeFilter(["text/html"]),
|
|
]
|
|
)
|
|
|
|
# Initialize the strategy with basic configuration
|
|
strategy = BFSScraperStrategy(
|
|
max_depth=2, # Only go 2 levels deep
|
|
filter_chain=filter_chain,
|
|
url_scorer=None, # Use default scoring
|
|
process_external_links=True,
|
|
)
|
|
|
|
# Create the crawler and scraper
|
|
async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
|
|
scraper = AsyncWebScraper(crawler, strategy)
|
|
# Start scraping
|
|
try:
|
|
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
|
|
|
|
# Process results
|
|
print(f"Crawled {len(result.crawled_urls)} pages:")
|
|
for url, data in result.extracted_data.items():
|
|
print(f"- {url}: {len(data.html)} bytes")
|
|
|
|
except Exception as e:
|
|
print(f"Error during scraping: {e}")
|
|
|
|
|
|
# advanced_scraper_example.py
|
|
import logging
|
|
from crawl4ai.scraper import (
|
|
AsyncWebScraper,
|
|
BFSScraperStrategy,
|
|
FilterChain,
|
|
URLPatternFilter,
|
|
ContentTypeFilter,
|
|
DomainFilter,
|
|
KeywordRelevanceScorer,
|
|
PathDepthScorer,
|
|
FreshnessScorer,
|
|
CompositeScorer,
|
|
)
|
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
|
|
|
|
async def advanced_scraper_example():
|
|
"""
|
|
Advanced example: Intelligent news site scraping
|
|
- Uses all filter types
|
|
- Implements sophisticated scoring
|
|
- Streams results
|
|
- Includes monitoring and logging
|
|
"""
|
|
# Set up logging
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger("advanced_scraper")
|
|
|
|
# Create sophisticated filter chain
|
|
filter_chain = FilterChain(
|
|
[
|
|
# Domain control
|
|
DomainFilter(
|
|
allowed_domains=["techcrunch.com"],
|
|
blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
|
|
),
|
|
# URL patterns
|
|
URLPatternFilter(
|
|
[
|
|
"*/article/*",
|
|
"*/news/*",
|
|
"*/blog/*",
|
|
re.compile(r"\d{4}/\d{2}/.*"), # Date-based URLs
|
|
]
|
|
),
|
|
# Content types
|
|
ContentTypeFilter(["text/html", "application/xhtml+xml"]),
|
|
]
|
|
)
|
|
|
|
# Create composite scorer
|
|
scorer = CompositeScorer(
|
|
[
|
|
# Prioritize by keywords
|
|
KeywordRelevanceScorer(
|
|
keywords=["news", "breaking", "update", "latest"], weight=1.0
|
|
),
|
|
# Prefer optimal URL structure
|
|
PathDepthScorer(optimal_depth=3, weight=0.7),
|
|
# Prioritize fresh content
|
|
FreshnessScorer(weight=0.9),
|
|
]
|
|
)
|
|
|
|
# Initialize strategy with advanced configuration
|
|
strategy = BFSScraperStrategy(
|
|
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
|
|
)
|
|
|
|
# Create crawler and scraper
|
|
async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
|
|
scraper = AsyncWebScraper(crawler, strategy)
|
|
|
|
# Track statistics
|
|
stats = {"processed": 0, "errors": 0, "total_size": 0}
|
|
|
|
try:
|
|
# Use streaming mode
|
|
result_generator = await scraper.ascrape(
|
|
"https://techcrunch.com", stream=True
|
|
)
|
|
async for result in result_generator:
|
|
stats["processed"] += 1
|
|
|
|
if result.success:
|
|
stats["total_size"] += len(result.html)
|
|
logger.info(f"Processed: {result.url}")
|
|
else:
|
|
stats["errors"] += 1
|
|
logger.error(
|
|
f"Failed to process {result.url}: {result.error_message}"
|
|
)
|
|
|
|
# Log progress regularly
|
|
if stats["processed"] % 10 == 0:
|
|
logger.info(f"Progress: {stats['processed']} URLs processed")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Scraping error: {e}")
|
|
|
|
finally:
|
|
# Print final statistics
|
|
logger.info("Scraping completed:")
|
|
logger.info(f"- URLs processed: {stats['processed']}")
|
|
logger.info(f"- Errors: {stats['errors']}")
|
|
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
|
|
|
|
# Print filter statistics
|
|
for filter_ in filter_chain.filters:
|
|
logger.info(f"{filter_.name} stats:")
|
|
logger.info(f"- Passed: {filter_.stats.passed_urls}")
|
|
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
|
|
|
|
# Print scorer statistics
|
|
logger.info("Scoring statistics:")
|
|
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
|
|
logger.info(
|
|
f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import asyncio
|
|
|
|
# Run basic example
|
|
print("Running basic scraper example...")
|
|
asyncio.run(basic_scraper_example())
|
|
|
|
# Run advanced example
|
|
# print("\nRunning advanced scraper example...")
|
|
# asyncio.run(advanced_scraper_example())
|