# Source: crawl4ai/docs/deep_crawl/deep_crawl_quickstart.py
# Snapshot: 2025-01-30 17:49:58 +05:30 (261 lines, 8.4 KiB, Python)

from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawl import (
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
import time
import logging
# Browser settings passed to every AsyncWebCrawler created in this file:
# headless, with an 800x600 viewport.
browser_config = BrowserConfig(
    headless=True,
    viewport_width=800,
    viewport_height=600,
)
async def basic_example():
    """
    Basic example: BFS deep crawl whose results are collected in one batch.

    - Filter chain: URL pattern ``*/basic/*`` plus content type ``text/html``
    - BFS strategy configured with ``max_depth=2`` and no custom URL scorer
    - Awaits the full result collection, then prints one line per result
    """
    # The filters are handed to the chain as a single list.
    url_filters = [
        URLPatternFilter("*/basic/*"),
        ContentTypeFilter(["text/html"]),
    ]

    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=FilterChain(url_filters),
        url_scorer=None,  # no custom scorer supplied
        process_external_links=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            # Built inside the try so a configuration error is reported the
            # same way as a crawl error.
            run_config = CrawlerRunConfig(deep_crawl_strategy=strategy)
            results = await crawler.arun("https://crawl4ai.com/mkdocs", run_config)

            # Summary line first, then one line per crawled result.
            print(f"Crawled {len(results)} pages:")
            for result in results:
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
async def advanced_example():
    """
    Advanced example: filtered, scored and streamed deep crawl of techcrunch.com.

    - Chains a domain filter, a URL-pattern filter and a content-type filter
    - Scores URLs with a composite of keyword, path-depth and freshness scorers
    - Consumes results as a stream, keeping only running counters instead of
      buffering every result
    - Logs per-result progress, then filter and scorer statistics at the end
    """
    # Set up logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("advanced_deep_crawler")

    # Create sophisticated filter chain
    filter_chain = FilterChain(
        [
            # Domain control
            DomainFilter(
                allowed_domains=["techcrunch.com"],
                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
            ),
            # URL patterns
            URLPatternFilter(
                [
                    "*/article/*",
                    "*/news/*",
                    "*/blog/*",
                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
                ]
            ),
            # Content types
            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
        ]
    )

    # Create composite scorer
    scorer = CompositeScorer(
        [
            # Prioritize by keywords
            KeywordRelevanceScorer(
                keywords=["news", "breaking", "update", "latest"], weight=1.0
            ),
            # Prefer optimal URL structure
            PathDepthScorer(optimal_depth=3, weight=0.7),
            # Prioritize fresh content
            FreshnessScorer(weight=0.9),
        ]
    )

    # Initialize strategy with advanced configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    # Create crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Running counters are the only state kept across results; results
        # themselves are not retained once they have been logged.
        stats = {"processed": 0, "errors": 0, "total_size": 0}
        try:
            # Use streaming mode
            result_generator = await crawler.arun(
                "https://techcrunch.com",
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
            )
            async for result in result_generator:
                stats["processed"] += 1
                if result.success:
                    stats["total_size"] += len(result.html)
                    logger.info(
                        f"Processed at depth: {result.depth} with score: {result.score:.3f} : \n {result.url}"
                    )
                else:
                    stats["errors"] += 1
                    logger.error(
                        f"Failed to process {result.url}: {result.error_message}"
                    )
                # Log progress regularly
                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")
        except Exception as e:
            logger.error(f"Scraping error: {e}")
        finally:
            # Runs whether the crawl finished or failed, so partial
            # statistics are still reported after an error.
            logger.info("Scraping completed:")
            logger.info(f"- URLs processed: {stats['processed']}")
            logger.info(f"- Errors: {stats['errors']}")
            logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
            # Print filter statistics
            for filter_ in filter_chain.filters:
                logger.info(f"{filter_.name} stats:")
                logger.info(f"- Passed: {filter_.stats.passed_urls}")
                logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
            logger.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )
async def basic_example_many_urls():
    """
    Basic example: deep crawl several start URLs in one ``arun_many`` call.

    Each entry returned by ``arun_many`` is iterated here as the group of
    results for one start URL. The groups are flattened before counting so
    the summary line reports crawled pages rather than start URLs.
    """
    filter_chain = FilterChain(
        [
            URLPatternFilter("*/basic/*"),
            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # no custom scorer supplied
        process_external_links=False,
    )
    # Create the crawler
    async with AsyncWebCrawler(
        config=browser_config,
    ) as crawler:
        # Start scraping
        try:
            results = await crawler.arun_many(
                urls=["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"],
                config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
            )
            # Flatten the per-start-URL groups; len(results) alone would
            # only count the start URLs.
            pages = [result for url_result in results for result in url_result]
            print(f"Crawled {len(pages)} pages:")
            for result in pages:
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
async def basic_example_many_urls_stream():
    """
    Basic example: deep crawl several start URLs with ``arun_many`` and
    ``stream=True``, printing one line per result as it is yielded.
    """
    url_filters = [
        URLPatternFilter("*/basic/*"),
        ContentTypeFilter(["text/html"]),
    ]
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=FilterChain(url_filters),
        url_scorer=None,  # no custom scorer supplied
        process_external_links=False,
    )
    start_urls = ["https://crawl4ai.com/mkdocs", "https://aravindkarnam.com"]

    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            stream_config = CrawlerRunConfig(
                deep_crawl_strategy=strategy,
                stream=True,
            )
            # The awaited call is iterated asynchronously below.
            result_stream = await crawler.arun_many(
                urls=start_urls,
                config=stream_config,
            )
            async for result in result_stream:
                print(f"- {result.url}: {len(result.html)} bytes")
        except Exception as e:
            print(f"Error during scraping: {e}")
if __name__ == "__main__":
    # asyncio is only needed when the file is run as a script; ``time`` is
    # already imported at module level, so it is not re-imported here.
    import asyncio

    # Run the basic example and report how long it took
    start_time = time.perf_counter()
    print("Running basic Deep crawl example...")
    asyncio.run(basic_example())
    end_time = time.perf_counter()
    print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")

    # Run the advanced (filtered, scored, streamed) example
    print("\nRunning advanced deep crawl example...")
    asyncio.run(advanced_example())

    # The two arun_many runs use the basic configuration, so they are
    # labelled as basic examples.
    print("\nRunning basic deep crawl example with arun_many...")
    asyncio.run(basic_example_many_urls())

    print("\nRunning basic deep crawl example with arun_many streaming enabled...")
    asyncio.run(basic_example_many_urls_stream())