Refactor: Moved deep_crawl_strategy inside crawler run config

This commit is contained in:
Aravind Karnam
2025-01-30 16:18:15 +05:30
parent 858c18df39
commit ca3f0126d3
9 changed files with 79 additions and 57 deletions

View File

@@ -1,18 +1,25 @@
# basic_scraper_example.py
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.traversal import (
BFSTraversalStrategy,
from crawl4ai.deep_crawl import (
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
import time
import logging
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
async def basic_scraper_example():
"""
Basic example: Scrape a blog site for articles
@@ -31,7 +38,7 @@ async def basic_scraper_example():
)
# Initialize the strategy with basic configuration
bfs_strategy = BFSTraversalStrategy(
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # Use default scoring
@@ -44,8 +51,8 @@ async def basic_scraper_example():
) as crawler:
# Start scraping
try:
results = await crawler.adeep_crawl(
"https://crawl4ai.com/mkdocs", strategy=bfs_strategy
results = await crawler.arun(
"https://crawl4ai.com/mkdocs", CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
)
# Process results
print(f"Crawled {len(results)} pages:")
@@ -55,23 +62,6 @@ async def basic_scraper_example():
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
from crawl4ai.traversal import (
BFSTraversalStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
async def advanced_scraper_example():
"""
Advanced example: Intelligent news site scraping
@@ -121,7 +111,7 @@ async def advanced_scraper_example():
)
# Initialize strategy with advanced configuration
bfs_strategy = BFSTraversalStrategy(
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
)
@@ -136,13 +126,10 @@ async def advanced_scraper_example():
try:
# Use streaming mode
results = []
result_generator = await crawler.adeep_crawl(
result_generator = await crawler.arun(
"https://techcrunch.com",
strategy=bfs_strategy,
crawler_run_config=CrawlerRunConfig(
scraping_strategy=LXMLWebScrapingStrategy()
),
stream=True,
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,
stream=True)
)
async for result in result_generator:
stats["processed"] += 1