#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.

This script demonstrates how to use the URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""

import asyncio
import json
from typing import Any, Dict, List

import httpx

# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"


async def discover_urls_basic_example():
    """Basic example of URL discovery."""
    print("🔍 Basic URL Discovery Example")
    print("=" * 50)

    # Basic discovery request
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",  # Use sitemap for fast discovery
            "max_urls": 10        # Limit to 10 URLs
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=30.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} URLs")

            # Display the first few URLs
            for i, url_obj in enumerate(urls[:3]):
                print(f"  {i + 1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


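# For reference, each element returned by /urls/discover is a JSON object.
# Judging from the fields these examples read, an item looks roughly like the
# sketch below (illustrative only; the exact fields depend on the options set):
#
#   {
#       "url": "https://example.com/news/some-article",
#       "status": "valid",        # populated when live_check is enabled
#       "head_data": {"title": "...", "description": "...", ...},
#       "score": 0.42             # populated when a scoring query is given
#   }

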
async def discover_urls_advanced_example():
    """Advanced example with filtering and metadata extraction."""
    print("\n🎯 Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",   # Filter to news articles only
            "extract_head": True,    # Extract page metadata
            "max_urls": 5,
            "live_check": True,      # Verify URLs are accessible
            "verbose": True
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n  {i + 1}. URL: {url_obj.get('url', 'N/A')}")
                print(f"     Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    # Use "or" so a null title/description can't break slicing
                    title = head_data.get('title') or 'No title'
                    description = head_data.get('description') or 'No description'
                    print(f"     Title: {title[:60]}...")
                    print(f"     Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


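# The "pattern" option above is a glob-style filter. As a rough local analogue
# (the server-side matcher may differ in detail), Python's fnmatch behaves like:
#
#   from fnmatch import fnmatch
#   fnmatch("https://nbcnews.com/news/world/story", "*/news/*")   # True
#   fnmatch("https://nbcnews.com/sports/story", "*/news/*")       # False

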
async def discover_urls_with_scoring_example():
    """Example using BM25 relevance scoring."""
    print("\n🏆 URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,          # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,        # Minimum relevance score
            "max_urls": 5
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} relevant URLs")

            # Display the top URLs with their relevance scores
            for i, url_obj in enumerate(urls[:3]):
                score = url_obj.get('score', 0)
                print(f"\n  {i + 1}. Score: {score:.3f}")
                print(f"     URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title') or 'No title'
                    print(f"     Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


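# The server already applies "score_threshold", but it can be useful to re-rank
# or tighten results on the client side too. A minimal sketch (this helper is
# illustrative, not part of the API):
def top_urls_by_score(urls: List[Dict[str, Any]], n: int = 10,
                      min_score: float = 0.0) -> List[Dict[str, Any]]:
    """Keep items at or above min_score, ordered by descending BM25 score."""
    kept = [u for u in urls if u.get("score", 0) >= min_score]
    return sorted(kept, key=lambda u: u.get("score", 0), reverse=True)[:n]

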
def demonstrate_request_schema():
    """Show the complete request schema with all options."""
    print("\n📋 Complete Request Schema")
    print("=" * 50)

    complete_schema = {
        "domain": "example.com",           # Required: domain to discover URLs from
        "seeding_config": {                # Optional: configuration object
            # Discovery sources
            "source": "sitemap+cc",        # "sitemap", "cc", or "sitemap+cc"

            # Filtering options
            "pattern": "*/blog/*",         # URL pattern filter (glob style)
            "max_urls": 50,                # Maximum URLs to return (-1 = no limit)
            "filter_nonsense_urls": True,  # Filter out nonsense URLs

            # Metadata and validation
            "extract_head": True,          # Extract <head> metadata
            "live_check": True,            # Verify URL accessibility

            # Performance and rate limiting
            "concurrency": 100,            # Concurrent requests
            "hits_per_sec": 10,            # Rate limit (requests/second)
            "force": False,                # Bypass cache

            # Relevance scoring (requires extract_head=True)
            "query": "search terms",       # Query for BM25 scoring
            "scoring_method": "bm25",      # Scoring algorithm
            "score_threshold": 0.2,        # Minimum score threshold

            # Debugging
            "verbose": True                # Enable verbose logging
        }
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))


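# The "Next steps" printed by main() suggest feeding discovered URLs into the
# /crawl endpoint. A minimal hand-off sketch, assuming /crawl accepts a JSON
# body with a "urls" list (check your server's /crawl schema; this helper is
# illustrative and is not called by main()):
async def crawl_discovered_urls(urls: List[Dict[str, Any]]) -> None:
    """Submit a small sample of discovered URLs to the /crawl endpoint."""
    if not urls:
        print("No URLs to crawl")
        return

    payload = {"urls": [u["url"] for u in urls[:3]]}  # keep the sample small
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{BASE_URL}/crawl",
            json=payload,
            timeout=120.0  # crawling is slower than discovery
        )
        response.raise_for_status()
        print(f"✅ Submitted {len(payload['urls'])} URLs to /crawl")

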
async def main():
    """Run all examples."""
    print("🚀 URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Check that the server is running
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✅ Server is running\n")
        except Exception as e:
            print(f"❌ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print("  docker compose up crawl4ai -d")
            return

    # Run examples
    await discover_urls_basic_example()
    await discover_urls_advanced_example()
    await discover_urls_with_scoring_example()

    # Show the full schema
    demonstrate_request_schema()

    print("\n🎉 Examples complete!")
    print("\nNext steps:")
    print("1. Use discovered URLs with the /crawl endpoint")
    print("2. Filter URLs based on your specific needs")
    print("3. Combine with other API endpoints for complete workflows")


if __name__ == "__main__":
    asyncio.run(main())