    # ... inside basic_deep_crawl(), earlier setup elided ...
    print(
        f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
    )
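# A minimal sketch of the kind of setup basic_deep_crawl() builds above
# (assumed shape - the elided code may differ). BFSDeepCrawlStrategy visits
# every link breadth-first up to max_depth; the import path below follows
# current crawl4ai releases, so check yours:
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
async def minimal_bfs_sketch():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
    )
    async with AsyncWebCrawler() as crawler:
        # Non-streaming by default: arun() returns the full list of results
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
        print(f"Crawled {len(results)} pages")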
# 2️⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
    """
    ...
    """
    # ... body elided ...
    print(f"✅ All results: {time.perf_counter() - start_time:.2f} seconds")
    print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3️⃣ Introduce Filters & Scorers
async def filters_and_scorers():
    """
    ...
    """
    # ... body elided ...
    print(f"✅ Crawler prioritized {len(results)} pages by relevance score")
    print("🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4️⃣ Wrap-Up and Key Takeaways
async def wrap_up():
    """
    PART 4: Wrap-Up and Key Takeaways

    Summarize the key concepts learned in this tutorial.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )
    # Create a keyword scorer that ranks candidate URLs by topical relevance
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )
    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )
    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    if results:  # guard against division by zero when nothing passes the filters
        print(
            f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
        )
    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f"  Depth {depth}: {count} pages")
# 5️⃣ Advanced Filters
async def advanced_filters():
    """
    PART 5: Demonstrates advanced filtering techniques for specialized crawling.

    This function covers:
    - SEO filters
    ...
    """
    # ... body elided ...
            relevance_score = result.metadata.get("relevance_score", 0)
            print(f"→ Score: {relevance_score:.2f} | {result.url}")
# 6️⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds():
    """
    PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled
    ...
    """
    # ... body elided ...
    print(f"✅ Average score: {avg_score:.2f}")
    print("🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# Main function to run the entire tutorial
async def run_tutorial():
    """
    Executes all tutorial sections in sequence.
    ...
    """
    # Define sections - uncomment to run specific parts during development
    tutorial_sections = [
        # basic_deep_crawl,
        # stream_vs_nonstream,
        # filters_and_scorers,
        max_pages_and_thresholds,  # Added new section
        wrap_up,
        advanced_filters,
    ]

    for section in tutorial_sections:
        # ... loop body elided ...
    print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
    print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly
if __name__ == "__main__":
    asyncio.run(run_tutorial())