This commit introduces a new cache mode, SMART, to the crawl4ai library. The SMART mode intelligently validates cached content using HEAD requests before using it, saving significant bandwidth while ensuring fresh content. The changes include modifications to the async_webcrawler.py, cache_context.py, and utils.py files in the crawl4ai directory. The async_webcrawler.py file now includes a check for the SMART cache mode and performs a HEAD check to see if the content has changed. If the content has changed, the url is re-crawled; otherwise, the cached result is used. The cache_context.py and utils.py files have been updated to support these changes. The documentation has also been updated to reflect these changes. The cache-modes.md file now includes a detailed explanation of the SMART mode, its logs, limitations, and an advanced example. The examples.md file now includes a link to the SMART Cache Mode example. The quickstart.md file now mentions the SMART mode in the note about cache modes. These changes improve the efficiency of the crawl4ai library by reducing unnecessary re-crawling and bandwidth usage. BREAKING CHANGE: The introduction of the SMART cache mode may affect existing code that uses the crawl4ai library and does not expect this new mode. Users should review the updated documentation to understand how to use this new mode.
202 lines
7.2 KiB
Python
202 lines
7.2 KiB
Python
"""
|
|
SMART Cache Mode Example for Crawl4AI
|
|
|
|
This example demonstrates how to use the SMART cache mode to intelligently
|
|
validate cached content before using it. SMART mode can save 70-95% bandwidth
|
|
on unchanged content while ensuring you always get fresh data when it changes.
|
|
|
|
SMART Cache Mode: Only Crawl When Changes
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
|
|
|
import asyncio
|
|
import time
|
|
from crawl4ai import AsyncWebCrawler
|
|
from crawl4ai.cache_context import CacheMode
|
|
from crawl4ai.async_configs import CrawlerRunConfig
|
|
|
|
|
|
async def basic_smart_cache_example():
|
|
"""Basic example showing SMART cache mode in action"""
|
|
print("=== Basic SMART Cache Example ===\n")
|
|
|
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
url = "https://example.com"
|
|
|
|
# First crawl: Cache the content
|
|
print("1. Initial crawl to cache the content:")
|
|
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
result1 = await crawler.arun(url=url, config=config)
|
|
print(f" Initial crawl: {len(result1.html)} bytes\n")
|
|
|
|
# Second crawl: Use SMART mode
|
|
print("2. SMART mode crawl (should use cache for static content):")
|
|
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
start_time = time.time()
|
|
result2 = await crawler.arun(url=url, config=smart_config)
|
|
elapsed = time.time() - start_time
|
|
print(f" SMART crawl: {len(result2.html)} bytes in {elapsed:.2f}s")
|
|
print(f" Content identical: {result1.html == result2.html}\n")
|
|
|
|
|
|
async def news_site_monitoring():
|
|
"""Monitor a news site for changes using SMART cache mode"""
|
|
print("=== News Site Monitoring Example ===\n")
|
|
|
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
url = "https://news.ycombinator.com"
|
|
|
|
print("Monitoring Hacker News for changes...\n")
|
|
|
|
previous_length = 0
|
|
for i in range(3):
|
|
result = await crawler.arun(url=url, config=config)
|
|
current_length = len(result.html)
|
|
|
|
if i == 0:
|
|
print(f"Check {i+1}: Initial crawl - {current_length} bytes")
|
|
else:
|
|
if current_length != previous_length:
|
|
print(f"Check {i+1}: Content changed! {previous_length} -> {current_length} bytes")
|
|
else:
|
|
print(f"Check {i+1}: Content unchanged - {current_length} bytes")
|
|
|
|
previous_length = current_length
|
|
|
|
if i < 2: # Don't wait after last check
|
|
print(" Waiting 10 seconds before next check...")
|
|
await asyncio.sleep(10)
|
|
|
|
print()
|
|
|
|
|
|
async def compare_cache_modes():
|
|
"""Compare different cache modes to understand SMART mode benefits"""
|
|
print("=== Cache Mode Comparison ===\n")
|
|
|
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
|
url = "https://www.wikipedia.org"
|
|
|
|
# First, populate the cache
|
|
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
await crawler.arun(url=url, config=config)
|
|
print("Cache populated.\n")
|
|
|
|
# Test different cache modes
|
|
modes = [
|
|
(CacheMode.ENABLED, "ENABLED (always uses cache if available)"),
|
|
(CacheMode.BYPASS, "BYPASS (never uses cache)"),
|
|
(CacheMode.SMART, "SMART (validates cache before using)")
|
|
]
|
|
|
|
for mode, description in modes:
|
|
config = CrawlerRunConfig(cache_mode=mode)
|
|
start_time = time.time()
|
|
result = await crawler.arun(url=url, config=config)
|
|
elapsed = time.time() - start_time
|
|
|
|
print(f"{description}:")
|
|
print(f" Time: {elapsed:.2f}s")
|
|
print(f" Size: {len(result.html)} bytes\n")
|
|
|
|
|
|
async def dynamic_content_example():
|
|
"""Show how SMART mode handles dynamic content"""
|
|
print("=== Dynamic Content Example ===\n")
|
|
|
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
# URL that returns different content each time
|
|
dynamic_url = "https://httpbin.org/uuid"
|
|
|
|
print("Testing with dynamic content (changes every request):\n")
|
|
|
|
# First crawl
|
|
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
result1 = await crawler.arun(url=dynamic_url, config=config)
|
|
|
|
# Extract UUID from the response
|
|
import re
|
|
uuid1 = re.search(r'"uuid":\s*"([^"]+)"', result1.html)
|
|
if uuid1:
|
|
print(f"1. First crawl UUID: {uuid1.group(1)}")
|
|
|
|
# SMART mode crawl - should detect change and re-crawl
|
|
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
result2 = await crawler.arun(url=dynamic_url, config=smart_config)
|
|
|
|
uuid2 = re.search(r'"uuid":\s*"([^"]+)"', result2.html)
|
|
if uuid2:
|
|
print(f"2. SMART crawl UUID: {uuid2.group(1)}")
|
|
print(f" Different UUIDs: {uuid1.group(1) != uuid2.group(1)} (should be True)")
|
|
|
|
|
|
async def bandwidth_savings_demo():
|
|
"""Demonstrate bandwidth savings with SMART mode"""
|
|
print("=== Bandwidth Savings Demo ===\n")
|
|
|
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
# List of URLs to crawl
|
|
urls = [
|
|
"https://example.com",
|
|
"https://www.python.org",
|
|
"https://docs.python.org/3/",
|
|
]
|
|
|
|
print("Crawling multiple URLs twice to show bandwidth savings:\n")
|
|
|
|
# First pass: Cache all URLs
|
|
print("First pass - Caching all URLs:")
|
|
total_bytes_pass1 = 0
|
|
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
|
|
for url in urls:
|
|
result = await crawler.arun(url=url, config=config)
|
|
total_bytes_pass1 += len(result.html)
|
|
print(f" {url}: {len(result.html)} bytes")
|
|
|
|
print(f"\nTotal downloaded in first pass: {total_bytes_pass1} bytes")
|
|
|
|
# Second pass: Use SMART mode
|
|
print("\nSecond pass - Using SMART mode:")
|
|
total_bytes_pass2 = 0
|
|
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
|
|
for url in urls:
|
|
result = await crawler.arun(url=url, config=smart_config)
|
|
# In SMART mode, unchanged content uses cache (minimal bandwidth)
|
|
print(f" {url}: Using {'cache' if result else 'fresh crawl'}")
|
|
|
|
print(f"\nBandwidth saved: ~{total_bytes_pass1} bytes (only HEAD requests sent)")
|
|
|
|
|
|
async def main():
|
|
"""Run all examples"""
|
|
examples = [
|
|
basic_smart_cache_example,
|
|
news_site_monitoring,
|
|
compare_cache_modes,
|
|
dynamic_content_example,
|
|
bandwidth_savings_demo
|
|
]
|
|
|
|
for example in examples:
|
|
await example()
|
|
print("\n" + "="*50 + "\n")
|
|
await asyncio.sleep(2) # Brief pause between examples
|
|
|
|
|
|
if __name__ == "__main__":
|
|
print("""
|
|
Crawl4AI SMART Cache Mode Examples
|
|
==================================
|
|
|
|
These examples demonstrate the SMART cache mode that intelligently
|
|
validates cached content using HEAD requests before deciding whether
|
|
to use cache or perform a fresh crawl.
|
|
|
|
""")
|
|
asyncio.run(main()) |