- Renamed `CrawlState` to `AdaptiveCrawlResult` to better reflect its purpose.
- Updated all references to `CrawlState` in the codebase, including method signatures and documentation.
- Modified the `AdaptiveCrawler` class to initialize and manage the new `AdaptiveCrawlResult` state.
- Adjusted example strategies and documentation to align with the new state class.
- Ensured all tests are updated to use `AdaptiveCrawlResult` instead of `CrawlState` (see the usage sketch below).
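
For reference, a minimal usage sketch of the renamed class, mirroring how the test script below calls it. The URL and query here are placeholders, and attribute names such as `crawled_urls` and `confidence` are taken from the script; this is an illustration of the rename, not a definitive API reference.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, AdaptiveCrawlResult


async def demo() -> None:
    config = AdaptiveConfig(confidence_threshold=0.7, max_pages=10)

    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler=crawler, config=config)

        # digest() returns the renamed AdaptiveCrawlResult (formerly CrawlState)
        result: AdaptiveCrawlResult = await adaptive.digest(
            start_url="https://example.com",  # placeholder URL
            query="example query",            # placeholder query
        )

        print(f"Pages crawled: {len(result.crawled_urls)}")
        print(f"Confidence: {adaptive.confidence:.2%}")


if __name__ == "__main__":
    asyncio.run(demo())
```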
"""
|
|
Test and demo script for Adaptive Crawler
|
|
|
|
This script demonstrates the progressive crawling functionality
|
|
with various configurations and use cases.
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from pathlib import Path
|
|
import time
|
|
from typing import Dict, List
|
|
from rich.console import Console
|
|
from rich.table import Table
|
|
from rich.progress import Progress
|
|
from rich import print as rprint
|
|
|
|
# Add parent directory to path for imports
|
|
import sys
|
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
AdaptiveCrawler,
|
|
AdaptiveConfig,
|
|
AdaptiveCrawlResult
|
|
)
|
|
|
|
|
|
console = Console()
|
|
|
|
|
|
|
|
|
|
def print_relevant_content(crawler: AdaptiveCrawler, top_k: int = 3):
    """Print most relevant content found"""
    relevant = crawler.get_relevant_content(top_k=top_k)

    if not relevant:
        console.print("[yellow]No relevant content found yet.[/yellow]")
        return

    console.print(f"\n[bold cyan]Top {len(relevant)} Most Relevant Pages:[/bold cyan]")
    for i, doc in enumerate(relevant, 1):
        console.print(f"\n[green]{i}. {doc['url']}[/green]")
        console.print(f"   Score: {doc['score']:.2f}")
        # Show snippet
        content = doc['content'] or ""
        snippet = content[:200].replace('\n', ' ') + "..." if len(content) > 200 else content
        console.print(f"   [dim]{snippet}[/dim]")


async def test_basic_progressive_crawl():
    """Test basic progressive crawling functionality"""
    console.print("\n[bold yellow]Test 1: Basic Progressive Crawl[/bold yellow]")
    console.print("Testing on Python documentation with query about async/await")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=10,
        top_k_links=2,
        min_gain_threshold=0.1
    )

    # Create crawler
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        # Start progressive crawl
        start_time = time.time()
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers"
        )
        elapsed = time.time() - start_time

        # Print results
        prog_crawler.print_stats(detailed=False)
        prog_crawler.print_stats(detailed=True)
        print_relevant_content(prog_crawler)

        console.print(f"\n[green]Crawl completed in {elapsed:.2f} seconds[/green]")
        console.print(f"Final confidence: {prog_crawler.confidence:.2%}")
        console.print(f"URLs crawled: {list(state.crawled_urls)[:5]}...")  # Show first 5

        # Test export functionality
        export_path = "knowledge_base_export.jsonl"
        prog_crawler.export_knowledge_base(export_path)
        console.print(f"[green]Knowledge base exported to {export_path}[/green]")

        # Clean up
        Path(export_path).unlink(missing_ok=True)


async def test_with_persistence():
    """Test state persistence and resumption"""
    console.print("\n[bold yellow]Test 2: Persistence and Resumption[/bold yellow]")
    console.print("Testing state save/load functionality")

    state_path = "test_crawl_state.json"

    config = AdaptiveConfig(
        confidence_threshold=0.6,
        max_pages=5,
        top_k_links=2,
        save_state=True,
        state_path=state_path
    )

    # First crawl - partial
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        state1 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response"
        )

        console.print(f"[cyan]First crawl: {len(state1.crawled_urls)} pages[/cyan]")

    # Resume crawl
    config.max_pages = 10  # Increase limit

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        state2 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response",
            resume_from=state_path
        )

        console.print(f"[green]Resumed crawl: {len(state2.crawled_urls)} total pages[/green]")

    # Clean up
    Path(state_path).unlink(missing_ok=True)


async def test_different_domains():
    """Test on different types of websites"""
    console.print("\n[bold yellow]Test 3: Different Domain Types[/bold yellow]")

    test_cases = [
        {
            "name": "Documentation Site",
            "url": "https://docs.python.org/3/",
            "query": "decorators and context managers"
        },
        {
            "name": "API Documentation",
            "url": "https://httpbin.org",
            "query": "http authentication headers"
        }
    ]

    for test in test_cases:
        console.print(f"\n[cyan]Testing: {test['name']}[/cyan]")
        console.print(f"URL: {test['url']}")
        console.print(f"Query: {test['query']}")

        config = AdaptiveConfig(
            confidence_threshold=0.6,
            max_pages=5,
            top_k_links=2
        )

        async with AsyncWebCrawler() as crawler:
            prog_crawler = AdaptiveCrawler(
                crawler=crawler,
                config=config
            )

            start_time = time.time()
            state = await prog_crawler.digest(
                start_url=test['url'],
                query=test['query']
            )
            elapsed = time.time() - start_time

            # Summary using print_stats
            prog_crawler.print_stats(detailed=False)


async def test_stopping_criteria():
    """Test different stopping criteria"""
    console.print("\n[bold yellow]Test 4: Stopping Criteria[/bold yellow]")

    # Test 1: High confidence threshold
    console.print("\n[cyan]4.1 High confidence threshold (0.9)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,  # Very high
        max_pages=20,
        top_k_links=3
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library"
        )

        console.print(f"Pages needed for 90% confidence: {len(state.crawled_urls)}")
        prog_crawler.print_stats(detailed=False)

    # Test 2: Page limit
    console.print("\n[cyan]4.2 Page limit (3 pages max)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,
        max_pages=3,  # Very low limit
        top_k_links=2
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library modules"
        )

        console.print(f"Stopped by: {'Page limit' if len(state.crawled_urls) >= 3 else 'Other'}")
        prog_crawler.print_stats(detailed=False)


async def test_crawl_patterns():
    """Analyze crawl patterns and link selection"""
    console.print("\n[bold yellow]Test 5: Crawl Pattern Analysis[/bold yellow]")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=8,
        top_k_links=2,
        min_gain_threshold=0.05
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        # Track crawl progress
        console.print("\n[cyan]Crawl Progress:[/cyan]")

        state = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http methods post get"
        )

        # Show crawl order
        console.print("\n[green]Crawl Order:[/green]")
        for i, url in enumerate(state.crawl_order, 1):
            console.print(f"{i}. {url}")

        # Show new terms discovered per page
        console.print("\n[green]New Terms Discovered:[/green]")
        for i, new_terms in enumerate(state.new_terms_history, 1):
            console.print(f"Page {i}: {new_terms} new terms")

        # Final metrics
        console.print(f"\n[yellow]Saturation reached: {state.metrics.get('saturation', 0):.2%}[/yellow]")


async def main():
    """Run all tests"""
    console.print("[bold magenta]Adaptive Crawler Test Suite[/bold magenta]")
    console.print("=" * 50)

    try:
        # Run tests
        await test_basic_progressive_crawl()
        # await test_with_persistence()
        # await test_different_domains()
        # await test_stopping_criteria()
        # await test_crawl_patterns()

        console.print("\n[bold green]✅ All tests completed successfully![/bold green]")

    except Exception as e:
        console.print(f"\n[bold red]❌ Test failed with error: {e}[/bold red]")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    # Run the test suite
    asyncio.run(main())