Refactor: Removed all scheduling logic from the scraper. From now on, the scraper expects arun_many to handle all scheduling; the scraper will only do traversal, validation, compliance checks, URL filtering, scoring, and so on. Reformatted some of the scraper files with the Black code formatter.

Aravind Karnam
2025-01-21 17:49:51 +05:30
parent 26d78d8512
commit 67fa06c09b
9 changed files with 316 additions and 243 deletions
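To illustrate the split described in the commit message, here is a minimal, self-contained sketch. It is not the crawl4ai API: FakeCrawler, FakeScraper, and the allowed_prefix filter are hypothetical stand-ins. The scraper side only traverses, filters, and scores URLs, while arun_many on the crawler side owns all scheduling (concurrency limits, ordering of fetches).

import asyncio
from typing import Iterable, List


class FakeCrawler:
    """Stand-in for the crawler: arun_many owns all scheduling for a batch of URLs."""

    def __init__(self, max_concurrent: int = 3):
        self._sem = asyncio.Semaphore(max_concurrent)

    async def arun_many(self, urls: Iterable[str]) -> List[str]:
        async def fetch(url: str) -> str:
            async with self._sem:            # concurrency limit lives here
                await asyncio.sleep(0.01)    # pretend network I/O
                return f"<html>{url}</html>"

        return await asyncio.gather(*(fetch(u) for u in urls))


class FakeScraper:
    """Stand-in for the scraper side: traversal, filtering and scoring only."""

    def __init__(self, crawler: FakeCrawler, allowed_prefix: str):
        self.crawler = crawler
        self.allowed_prefix = allowed_prefix

    def _filter_and_score(self, urls: Iterable[str]) -> List[str]:
        kept = [u for u in urls if u.startswith(self.allowed_prefix)]
        return sorted(kept, key=len)         # trivial "scorer": shorter URLs first

    async def ascrape(self, seeds: List[str]) -> List[str]:
        frontier = self._filter_and_score(seeds)
        # No semaphores or crawl delays here any more: the filtered, scored
        # batch goes straight to arun_many, which decides how fetches run.
        return await self.crawler.arun_many(frontier)


async def main() -> None:
    crawler = FakeCrawler(max_concurrent=2)
    scraper = FakeScraper(crawler, allowed_prefix="https://example.com/")
    pages = await scraper.ascrape(
        ["https://example.com/a", "https://example.com/blog/x", "https://other.org/y"]
    )
    print(f"Fetched {len(pages)} pages")


if __name__ == "__main__":
    asyncio.run(main())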


@@ -4,13 +4,14 @@ from crawl4ai.scraper import (
BFSScraperStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter
ContentTypeFilter,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
import re
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
async def basic_scraper_example():
"""
Basic example: Scrape a blog site for articles
@@ -19,37 +20,39 @@ async def basic_scraper_example():
- Collects all results at once
"""
# Create a simple filter chain
filter_chain = FilterChain([
# Only crawl pages within the blog section
URLPatternFilter("*/tutorial/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"])
])
filter_chain = FilterChain(
[
# Only crawl pages within the blog section
URLPatternFilter("*/tutorial/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"]),
]
)
# Initialize the strategy with basic configuration
strategy = BFSScraperStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # Use default scoring
max_concurrent=3, # Limit concurrent requests
process_external_links=True
process_external_links=True,
)
# Create the crawler and scraper
async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
from crawl4ai.scraper import (
@@ -62,10 +65,11 @@ from crawl4ai.scraper import (
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
async def advanced_scraper_example():
"""
Advanced example: Intelligent news site scraping
@@ -79,49 +83,44 @@ async def advanced_scraper_example():
logger = logging.getLogger("advanced_scraper")
# Create sophisticated filter chain
filter_chain = FilterChain([
# Domain control
DomainFilter(
allowed_domains=["techcrunch.com"],
blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
),
# URL patterns
URLPatternFilter([
"*/article/*",
"*/news/*",
"*/blog/*",
re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs
]),
# Content types
ContentTypeFilter([
"text/html",
"application/xhtml+xml"
])
])
filter_chain = FilterChain(
[
# Domain control
DomainFilter(
allowed_domains=["techcrunch.com"],
blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
),
# URL patterns
URLPatternFilter(
[
"*/article/*",
"*/news/*",
"*/blog/*",
re.compile(r"\d{4}/\d{2}/.*"), # Date-based URLs
]
),
# Content types
ContentTypeFilter(["text/html", "application/xhtml+xml"]),
]
)
# Create composite scorer
scorer = CompositeScorer([
# Prioritize by keywords
KeywordRelevanceScorer(
keywords=["news", "breaking", "update", "latest"],
weight=1.0
),
# Prefer optimal URL structure
PathDepthScorer(
optimal_depth=3,
weight=0.7
),
# Prioritize fresh content
FreshnessScorer(weight=0.9)
])
scorer = CompositeScorer(
[
# Prioritize by keywords
KeywordRelevanceScorer(
keywords=["news", "breaking", "update", "latest"], weight=1.0
),
# Prefer optimal URL structure
PathDepthScorer(optimal_depth=3, weight=0.7),
# Prioritize fresh content
FreshnessScorer(weight=0.9),
]
)
# Initialize strategy with advanced configuration
strategy = BFSScraperStrategy(
max_depth=2,
filter_chain=filter_chain,
url_scorer=scorer,
max_concurrent=2,
min_crawl_delay=1
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
)
# Create crawler and scraper
@@ -129,57 +128,60 @@ async def advanced_scraper_example():
scraper = AsyncWebScraper(crawler, strategy)
# Track statistics
stats = {
'processed': 0,
'errors': 0,
'total_size': 0
}
stats = {"processed": 0, "errors": 0, "total_size": 0}
try:
# Use streaming mode
result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
result_generator = await scraper.ascrape(
"https://techcrunch.com", stream=True
)
async for result in result_generator:
stats['processed'] += 1
stats["processed"] += 1
if result.success:
stats['total_size'] += len(result.html)
stats["total_size"] += len(result.html)
logger.info(f"Processed: {result.url}")
else:
stats['errors'] += 1
logger.error(f"Failed to process {result.url}: {result.error_message}")
stats["errors"] += 1
logger.error(
f"Failed to process {result.url}: {result.error_message}"
)
# Log progress regularly
if stats['processed'] % 10 == 0:
if stats["processed"] % 10 == 0:
logger.info(f"Progress: {stats['processed']} URLs processed")
except Exception as e:
logger.error(f"Scraping error: {e}")
finally:
# Print final statistics
logger.info("Scraping completed:")
logger.info(f"- URLs processed: {stats['processed']}")
logger.info(f"- Errors: {stats['errors']}")
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
# Print filter statistics
for filter_ in filter_chain.filters:
logger.info(f"{filter_.name} stats:")
logger.info(f"- Passed: {filter_.stats.passed_urls}")
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
# Print scorer statistics
logger.info("Scoring statistics:")
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
logger.info(
f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
)
if __name__ == "__main__":
import asyncio
# Run basic example
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
# Run advanced example
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())