crawl4ai/tests/adaptive/test_adaptive_crawler.py
UncleCode 1a73fb60db feat(crawl4ai): Implement adaptive crawling feature
This commit introduces adaptive crawling to crawl4ai. Adaptive crawling intelligently determines when sufficient information has been gathered during a crawl, improving efficiency and reducing unnecessary resource usage.

The changes add the adaptive crawler itself (the main crawler module plus utility, configuration, and strategy scripts), modify the project's __init__.py and utils.py, and update the documentation with detailed explanations and examples of the feature.

Adaptive crawling makes crawl4ai a more efficient, query-aware tool: instead of exhausting a fixed page budget, a crawl can stop as soon as the gathered content answers the query with sufficient confidence.

Significant modifications:
- Added adaptive_crawler.py and related scripts
- Modified __init__.py and utils.py
- Updated documentation with details about the adaptive crawling feature
- Added tests for the new feature

BREAKING CHANGE: This feature addition modifies the package's public interface (new exports in __init__.py and changes to utils.py) and may affect existing behavior. Review the updated documentation to understand how to use the new feature.

Refs: #123, #456
2025-07-04 15:16:53 +08:00

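For orientation before the test file itself, here is a minimal sketch of the usage pattern the tests below exercise. Every name used (AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, digest, confidence, print_stats, crawled_urls) is taken from this test file; the parameter values are illustrative, not a definitive API reference.

import asyncio
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

async def run():
    # Stop once confidence reaches 75% or 15 pages have been fetched,
    # whichever comes first (illustrative thresholds).
    config = AdaptiveConfig(confidence_threshold=0.75, max_pages=15)
    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler=crawler, config=config)
        state = await adaptive.digest(
            start_url="https://docs.python.org/3/",
            query="async await",
        )
        adaptive.print_stats(detailed=False)
        print(f"Confidence: {adaptive.confidence:.2%} "
              f"after {len(state.crawled_urls)} pages")

asyncio.run(run())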

"""
Test and demo script for Adaptive Crawler
This script demonstrates the progressive crawling functionality
with various configurations and use cases.
"""
import asyncio
import json
from pathlib import Path
import time
from typing import Dict, List
from rich.console import Console
from rich.table import Table
from rich.progress import Progress
from rich import print as rprint
# Add parent directory to path for imports
import sys
sys.path.append(str(Path(__file__).parent.parent))
from crawl4ai import (
AsyncWebCrawler,
AdaptiveCrawler,
AdaptiveConfig,
CrawlState
)
console = Console()

def print_relevant_content(crawler: AdaptiveCrawler, top_k: int = 3):
    """Print most relevant content found"""
    relevant = crawler.get_relevant_content(top_k=top_k)
    if not relevant:
        console.print("[yellow]No relevant content found yet.[/yellow]")
        return

    console.print(f"\n[bold cyan]Top {len(relevant)} Most Relevant Pages:[/bold cyan]")
    for i, doc in enumerate(relevant, 1):
        console.print(f"\n[green]{i}. {doc['url']}[/green]")
        console.print(f" Score: {doc['score']:.2f}")
        # Show snippet
        content = doc['content'] or ""
        snippet = content[:200].replace('\n', ' ') + "..." if len(content) > 200 else content
        console.print(f" [dim]{snippet}[/dim]")

async def test_basic_progressive_crawl():
    """Test basic progressive crawling functionality"""
    console.print("\n[bold yellow]Test 1: Basic Progressive Crawl[/bold yellow]")
    console.print("Testing on Python documentation with query about async/await")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=10,
        top_k_links=2,
        min_gain_threshold=0.1
    )

    # Create crawler
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        # Start progressive crawl
        start_time = time.time()
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers"
        )
        elapsed = time.time() - start_time

        # Print results
        prog_crawler.print_stats(detailed=False)
        prog_crawler.print_stats(detailed=True)
        print_relevant_content(prog_crawler)

        console.print(f"\n[green]Crawl completed in {elapsed:.2f} seconds[/green]")
        console.print(f"Final confidence: {prog_crawler.confidence:.2%}")
        console.print(f"URLs crawled: {list(state.crawled_urls)[:5]}...")  # Show first 5

        # Test export functionality
        export_path = "knowledge_base_export.jsonl"
        prog_crawler.export_knowledge_base(export_path)
        console.print(f"[green]Knowledge base exported to {export_path}[/green]")

        # Clean up
        Path(export_path).unlink(missing_ok=True)

async def test_with_persistence():
    """Test state persistence and resumption"""
    console.print("\n[bold yellow]Test 2: Persistence and Resumption[/bold yellow]")
    console.print("Testing state save/load functionality")

    state_path = "test_crawl_state.json"
    config = AdaptiveConfig(
        confidence_threshold=0.6,
        max_pages=5,
        top_k_links=2,
        save_state=True,
        state_path=state_path
    )

    # First crawl - partial
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )
        state1 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response"
        )
        console.print(f"[cyan]First crawl: {len(state1.crawled_urls)} pages[/cyan]")

    # Resume crawl
    config.max_pages = 10  # Increase limit
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )
        state2 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response",
            resume_from=state_path
        )
        console.print(f"[green]Resumed crawl: {len(state2.crawled_urls)} total pages[/green]")

    # Clean up
    Path(state_path).unlink(missing_ok=True)

async def test_different_domains():
    """Test on different types of websites"""
    console.print("\n[bold yellow]Test 3: Different Domain Types[/bold yellow]")

    test_cases = [
        {
            "name": "Documentation Site",
            "url": "https://docs.python.org/3/",
            "query": "decorators and context managers"
        },
        {
            "name": "API Documentation",
            "url": "https://httpbin.org",
            "query": "http authentication headers"
        }
    ]

    for test in test_cases:
        console.print(f"\n[cyan]Testing: {test['name']}[/cyan]")
        console.print(f"URL: {test['url']}")
        console.print(f"Query: {test['query']}")

        config = AdaptiveConfig(
            confidence_threshold=0.6,
            max_pages=5,
            top_k_links=2
        )

        async with AsyncWebCrawler() as crawler:
            prog_crawler = AdaptiveCrawler(
                crawler=crawler,
                config=config
            )
            start_time = time.time()
            state = await prog_crawler.digest(
                start_url=test['url'],
                query=test['query']
            )
            elapsed = time.time() - start_time

            # Summary using print_stats
            prog_crawler.print_stats(detailed=False)
            console.print(f"[dim]Completed in {elapsed:.2f}s[/dim]")

async def test_stopping_criteria():
    """Test different stopping criteria"""
    console.print("\n[bold yellow]Test 4: Stopping Criteria[/bold yellow]")

    # Test 1: High confidence threshold
    console.print("\n[cyan]4.1 High confidence threshold (0.9)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,  # Very high
        max_pages=20,
        top_k_links=3
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library"
        )
        console.print(f"Pages needed for 90% confidence: {len(state.crawled_urls)}")
        prog_crawler.print_stats(detailed=False)

    # Test 2: Page limit
    console.print("\n[cyan]4.2 Page limit (3 pages max)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,
        max_pages=3,  # Very low limit
        top_k_links=2
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library modules"
        )
        console.print(f"Stopped by: {'Page limit' if len(state.crawled_urls) >= 3 else 'Other'}")
        prog_crawler.print_stats(detailed=False)

async def test_crawl_patterns():
    """Analyze crawl patterns and link selection"""
    console.print("\n[bold yellow]Test 5: Crawl Pattern Analysis[/bold yellow]")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=8,
        top_k_links=2,
        min_gain_threshold=0.05
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        # Track crawl progress
        console.print("\n[cyan]Crawl Progress:[/cyan]")
        state = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http methods post get"
        )

        # Show crawl order
        console.print("\n[green]Crawl Order:[/green]")
        for i, url in enumerate(state.crawl_order, 1):
            console.print(f"{i}. {url}")

        # Show new terms discovered per page
        console.print("\n[green]New Terms Discovered:[/green]")
        for i, new_terms in enumerate(state.new_terms_history, 1):
            console.print(f"Page {i}: {new_terms} new terms")

        # Final metrics
        console.print(f"\n[yellow]Saturation reached: {state.metrics.get('saturation', 0):.2%}[/yellow]")

async def main():
    """Run the test suite"""
    console.print("[bold magenta]Adaptive Crawler Test Suite[/bold magenta]")
    console.print("=" * 50)

    try:
        # Run tests (the remaining tests are disabled by default;
        # uncomment them to run the full suite)
        await test_basic_progressive_crawl()
        # await test_with_persistence()
        # await test_different_domains()
        # await test_stopping_criteria()
        # await test_crawl_patterns()

        console.print("\n[bold green]✅ All tests completed successfully![/bold green]")
    except Exception as e:
        console.print(f"\n[bold red]❌ Test failed with error: {e}[/bold red]")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Run the test suite
    asyncio.run(main())