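"""
Deep crawl examples for Crawl4AI.

Demonstrates BFSDeepCrawlStrategy four ways: a basic single-seed crawl,
an advanced crawl with filtering, scoring, and streaming, and batch
crawls via arun_many with and without streaming.
"""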
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import (
    BFSDeepCrawlStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import asyncio
import re
import time
import logging

# Shared browser configuration: headless browser with a small viewport.
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)


async def basic_example():
    """
    Basic example: deep crawl a documentation site
    - Follows only URLs matching */basic/*
    - Processes only HTML pages
    - Collects all results at once
    """
    # Create a simple filter chain
    filter_chain = FilterChain(
        [
            # Only crawl pages within the matching section
            URLPatternFilter("*/basic/*"),
            # Only process HTML pages
            ContentTypeFilter(["text/html"]),
        ]
    )

    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=True,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            results = await crawler.arun(
                "https://crawl4ai.com/mkdocs",
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Process results
            print(f"Crawled {len(results)} pages:")
            for result in results:
                print(f"- {result.url}: {len(result.html)} bytes")

        except Exception as e:
            print(f"Error during crawling: {e}")


async def advanced_example():
    """
    Advanced example: intelligent news site crawling
    - Uses all filter types
    - Implements sophisticated scoring
    - Streams results
    - Includes monitoring and logging
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_deep_crawler")

    # Create sophisticated filter chain
    filter_chain = FilterChain(
        [
            # Domain control
            DomainFilter(
                allowed_domains=["techcrunch.com"],
                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
            ),
            # URL patterns
            URLPatternFilter(
                [
                    "*/article/*",
                    "*/news/*",
                    "*/blog/*",
                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
                ]
            ),
            # Content types
            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
        ]
    )
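    # Note: the chain uses AND semantics, so a URL is enqueued only if it
    # passes every filter above.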

    # Create composite scorer
    scorer = CompositeScorer(
        [
            # Prioritize by keywords
            KeywordRelevanceScorer(
                keywords=["news", "breaking", "update", "latest"], weight=1.0
            ),
            # Prefer optimal URL structure
            PathDepthScorer(optimal_depth=3, weight=0.7),
            # Prioritize fresh content
            FreshnessScorer(weight=0.9),
        ]
    )
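    # Each scorer contributes its weighted score to the composite, so keyword
    # relevance (weight 1.0) counts most, then freshness (0.9), then path
    # depth (0.7).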

    # Initialize strategy with advanced configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    # Create crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Track statistics
        stats = {"processed": 0, "errors": 0, "total_size": 0}

        try:
            # Use streaming mode
            results = []
            result_generator = await crawler.arun(
                "https://techcrunch.com",
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
            )
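            # With stream=True, arun() hands back an async generator that
            # yields each result as soon as its page finishes, rather than
            # buffering the entire crawl.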
            async for result in result_generator:
                stats["processed"] += 1

                if result.success:
                    # Deep crawl strategies record depth and score in result.metadata
                    depth = result.metadata.get("depth", 0)
                    score = result.metadata.get("score", 0)
                    stats["total_size"] += len(result.html)
                    logger.info(
                        f"Processed at depth {depth} with score {score:.3f}:\n {result.url}"
                    )
                    results.append(result)
                else:
                    stats["errors"] += 1
                    logger.error(
                        f"Failed to process {result.url}: {result.error_message}"
                    )

                # Log progress regularly
                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")

        except Exception as e:
            logger.error(f"Crawling error: {e}")

        finally:
            # Print final statistics
            logger.info("Crawl completed:")
            logger.info(f"- URLs processed: {stats['processed']}")
            logger.info(f"- Errors: {stats['errors']}")
            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")

            # Print filter statistics
            for filter_ in filter_chain.filters:
                logger.info(f"{filter_.name} stats:")
                logger.info(f"- Passed: {filter_.stats.passed_urls}")
                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")

            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
            logger.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )


async def basic_example_many_urls():
    """
    Batch example: run the same deep crawl over several seed URLs
    with arun_many, collecting all results at once.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
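            # With a deep crawl strategy, arun_many returns one list of
            # results per seed URL, hence the nested loop below.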
            # Process results
            print(f"Crawled {sum(len(r) for r in results)} pages:")
            for url_results in results:
                for result in url_results:
                    print(f"- {result.url}: {len(result.html)} bytes")

        except Exception as e:
            print(f"Error during crawling: {e}")


async def basic_example_many_urls_stream():
    """
    Streaming batch example: the same multi-seed deep crawl, consuming
    results as they arrive instead of waiting for the full batch.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
        include_external=False,
    )

    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start crawling
        try:
            # With stream=True, arun_many yields results from all seed URLs
            # as soon as each page completes.
            async for result in await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
            ):
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during crawling: {e}")


if __name__ == "__main__":
    # Run basic example
    start_time = time.perf_counter()
    print("Running basic deep crawl example...")
    asyncio.run(basic_example())
    end_time = time.perf_counter()
    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")

    # Run advanced example
    print("\nRunning advanced deep crawl example...")
    asyncio.run(advanced_example())

    # Run batch examples
    print("\nRunning basic deep crawl example with arun_many...")
    asyncio.run(basic_example_many_urls())

    print("\nRunning basic deep crawl example with arun_many streaming enabled...")
    asyncio.run(basic_example_many_urls_stream())