This commit introduces a new cache mode, SMART, to the crawl4ai library. The SMART mode intelligently validates cached content using HEAD requests before using it, saving significant bandwidth while ensuring fresh content. The changes include modifications to the async_webcrawler.py, cache_context.py, and utils.py files in the crawl4ai directory. The async_webcrawler.py file now includes a check for the SMART cache mode and performs a HEAD check to see if the content has changed. If the content has changed, the URL is re-crawled; otherwise, the cached result is used. The cache_context.py and utils.py files have been updated to support these changes. The documentation has also been updated to reflect these changes. The cache-modes.md file now includes a detailed explanation of the SMART mode, its logs, limitations, and an advanced example. The examples.md file now includes a link to the SMART Cache Mode example. The quickstart.md file now mentions the SMART mode in the note about cache modes. These changes improve the efficiency of the crawl4ai library by reducing unnecessary re-crawling and bandwidth usage. BREAKING CHANGE: The introduction of the SMART cache mode may affect existing code that uses the crawl4ai library and does not expect this new mode. Users should review the updated documentation to understand how to use this new mode.
69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
"""
Simple test for SMART cache mode functionality.
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler
|
|
from crawl4ai.async_configs import CrawlerRunConfig
|
|
from crawl4ai.cache_context import CacheMode
|
|
import time
|
|
|
|
|
|
async def test_smart_cache():
|
|
"""Test SMART cache mode with a simple example"""
|
|
|
|
print("Testing SMART Cache Mode")
|
|
print("-" * 40)
|
|
|
|
# Test URL
|
|
url = "https://example.com"
|
|
|
|
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
# First crawl with normal caching
|
|
print("\n1. Initial crawl with ENABLED mode:")
|
|
config1 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
result1 = await crawler.arun(url=url, config=config1)
|
|
print(f" Crawled: {len(result1.html)} bytes")
|
|
print(f" Headers: {list(result1.response_headers.keys())[:3]}...")
|
|
|
|
# Wait a moment
|
|
await asyncio.sleep(2)
|
|
|
|
# Re-crawl with SMART mode
|
|
print("\n2. Re-crawl with SMART mode:")
|
|
config2 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
start = time.time()
|
|
result2 = await crawler.arun(url=url, config=config2)
|
|
elapsed = time.time() - start
|
|
|
|
print(f" Time: {elapsed:.2f}s")
|
|
print(f" Result: {len(result2.html)} bytes")
|
|
print(f" Should use cache (content unchanged)")
|
|
|
|
# Test with dynamic content
|
|
print("\n3. Testing with dynamic URL:")
|
|
dynamic_url = "https://httpbin.org/uuid"
|
|
|
|
# First crawl
|
|
config3 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
|
result3 = await crawler.arun(url=dynamic_url, config=config3)
|
|
content1 = result3.html
|
|
|
|
# Re-crawl with SMART
|
|
config4 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
|
result4 = await crawler.arun(url=dynamic_url, config=config4)
|
|
content2 = result4.html
|
|
|
|
print(f" Content changed: {content1 != content2}")
|
|
print(f" Should re-crawl (dynamic content)")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
print(f"Python path: {sys.path[0]}")
|
|
print(f"CacheMode values: {[e.value for e in CacheMode]}")
|
|
print()
|
|
asyncio.run(test_smart_cache()) |