#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.

This script demonstrates how to use the new URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""

import asyncio
import httpx
import json
from typing import List, Dict, Any

# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"


async def discover_urls_basic_example():
    """Basic example of URL discovery."""
    print("šŸ” Basic URL Discovery Example")
    print("=" * 50)

    # Basic discovery request
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",  # Use sitemap for fast discovery
            "max_urls": 10        # Limit to 10 URLs
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=30.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"āœ… Found {len(urls)} URLs")

            # Display first few URLs
            for i, url_obj in enumerate(urls[:3]):
                print(f"  {i+1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"āŒ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"āŒ Error: {e}")
            return []


async def discover_urls_advanced_example():
    """Advanced example with filtering and metadata extraction."""
    print("\nšŸŽÆ Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",   # Filter to news articles only
            "extract_head": True,    # Extract page metadata
            "max_urls": 5,
            "live_check": True,      # Verify URLs are accessible
            "verbose": True
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"āœ… Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n  {i+1}. URL: {url_obj.get('url', 'N/A')}")
                print(f"     Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    description = head_data.get('description', 'No description')
                    print(f"     Title: {title[:60]}...")
                    print(f"     Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"āŒ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"āŒ Error: {e}")
            return []


async def discover_urls_with_scoring_example():
    """Example using BM25 relevance scoring."""
    print("\nšŸ† URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,          # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,        # Minimum relevance score
            "max_urls": 5
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"āœ… Found {len(urls)} relevant URLs")

            # Display URLs sorted by relevance score
            for i, url_obj in enumerate(urls[:3]):
                score = url_obj.get('score', 0)
                print(f"\n  {i+1}. Score: {score:.3f}")
                print(f"     URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title', 'No title')
                    print(f"     Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"āŒ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"āŒ Error: {e}")
            return []


def demonstrate_request_schema():
    """Show the complete request schema with all options."""
    print("\nšŸ“‹ Complete Request Schema")
    print("=" * 50)

    complete_schema = {
        "domain": "example.com",           # Required: Domain to discover URLs from
        "seeding_config": {                # Optional: Configuration object
            # Discovery sources
            "source": "sitemap+cc",        # "sitemap", "cc", or "sitemap+cc"

            # Filtering options
            "pattern": "*/blog/*",         # URL pattern filter (glob style)
            "max_urls": 50,                # Maximum URLs to return (-1 = no limit)
            "filter_nonsense_urls": True,  # Filter out nonsense URLs

            # Metadata and validation
            "extract_head": True,          # Extract metadata
            "live_check": True,            # Verify URL accessibility

            # Performance and rate limiting
            "concurrency": 100,            # Concurrent requests
            "hits_per_sec": 10,            # Rate limit (requests/second)
            "force": False,                # Bypass cache

            # Relevance scoring (requires extract_head=True)
            "query": "search terms",       # Query for BM25 scoring
            "scoring_method": "bm25",      # Scoring algorithm
            "score_threshold": 0.2,        # Minimum score threshold

            # Debugging
            "verbose": True                # Enable verbose logging
        }
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))


async def main():
    """Run all examples."""
    print("šŸš€ URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Check if server is running
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("āœ… Server is running\n")
        except Exception as e:
            print(f"āŒ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print("  docker compose up crawl4ai -d")
            return

    # Run examples
    await discover_urls_basic_example()
    await discover_urls_advanced_example()
    await discover_urls_with_scoring_example()

    # Show schema
    demonstrate_request_schema()

    print("\nšŸŽ‰ Examples complete!")
    print("\nNext steps:")
    print("1. Use discovered URLs with the /crawl endpoint")
    print("2. Filter URLs based on your specific needs")
    print("3. Combine with other API endpoints for complete workflows")


if __name__ == "__main__":
    asyncio.run(main())
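

# ---------------------------------------------------------------------------
# Appendix: hand discovered URLs to /crawl (hedged sketch, not run above).
#
# The "Next steps" printed by main() mention using discovered URLs with the
# /crawl endpoint. The helper below is a minimal sketch of that hand-off. It
# assumes the server's /crawl endpoint accepts a JSON body with a "urls" list
# and an optional "crawler_config" object; check your server's /docs (OpenAPI)
# page for the exact schema before relying on it.
#
# Example usage (inside an async context):
#     urls = await discover_urls_basic_example()
#     results = await crawl_discovered_urls(urls, limit=3)
# ---------------------------------------------------------------------------
async def crawl_discovered_urls(urls: List[Dict[str, Any]], limit: int = 3):
    """Sketch: submit a handful of discovered URLs to the /crawl endpoint."""
    # Pull plain URL strings out of the discovery results
    url_list = [u["url"] for u in urls[:limit] if u.get("url")]
    if not url_list:
        print("No URLs to crawl")
        return None

    request_data = {
        "urls": url_list,      # assumed request field; verify against /docs
        "crawler_config": {},  # assumed optional config object
    }

    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{BASE_URL}/crawl",
            json=request_data,
            timeout=120.0,  # crawling is slower than discovery
        )
        response.raise_for_status()
        return response.json()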