#!/usr/bin/env python3
"""
Runnable example for the /urls/discover endpoint.

This script demonstrates how to use the URL Discovery API endpoint
to find relevant URLs from a domain before committing to a full crawl.
"""

import asyncio
import json
from typing import Any, Dict, List

import httpx

# Configuration
BASE_URL = "http://localhost:11235"
EXAMPLE_DOMAIN = "nbcnews.com"


async def discover_urls_basic_example():
    """Basic example of URL discovery."""
    print("🔍 Basic URL Discovery Example")
    print("=" * 50)

    # Basic discovery request
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",  # Use sitemap for fast discovery
            "max_urls": 10        # Limit to 10 URLs
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=30.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} URLs")

            # Display the first few URLs
            for i, url_obj in enumerate(urls[:3]):
                print(f"  {i + 1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


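# For reference, each element returned by /urls/discover is a JSON object.
# Judging from the fields these examples read, an item looks roughly like the
# sketch below (illustrative only; the exact fields depend on the options set):
#
#   {
#       "url": "https://example.com/news/some-article",
#       "status": "valid",        # populated when live_check is enabled
#       "head_data": {"title": "...", "description": "...", ...},
#       "score": 0.42             # populated when a scoring query is given
#   }

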
async def discover_urls_advanced_example():
    """Advanced example with filtering and metadata extraction."""
    print("\n🎯 Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",   # Filter to news articles only
            "extract_head": True,    # Extract page metadata
            "max_urls": 5,
            "live_check": True,      # Verify URLs are accessible
            "verbose": True
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n  {i + 1}. URL: {url_obj.get('url', 'N/A')}")
                print(f"     Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    # Use "or" so a null title/description can't break slicing
                    title = head_data.get('title') or 'No title'
                    description = head_data.get('description') or 'No description'
                    print(f"     Title: {title[:60]}...")
                    print(f"     Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


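# The "pattern" option above is a glob-style filter. As a rough local analogue
# (the server-side matcher may differ in detail), Python's fnmatch behaves like:
#
#   from fnmatch import fnmatch
#   fnmatch("https://nbcnews.com/news/world/story", "*/news/*")   # True
#   fnmatch("https://nbcnews.com/sports/story", "*/news/*")       # False

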
async def discover_urls_with_scoring_example():
    """Example using BM25 relevance scoring."""
    print("\n🏆 URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,          # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,        # Minimum relevance score
            "max_urls": 5
        }
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} relevant URLs")

            # Display the top URLs with their relevance scores
            for i, url_obj in enumerate(urls[:3]):
                score = url_obj.get('score', 0)
                print(f"\n  {i + 1}. Score: {score:.3f}")
                print(f"     URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    title = head_data.get('title') or 'No title'
                    print(f"     Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            print(f"❌ Error: {e}")
            return []


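# The server already applies "score_threshold", but it can be useful to re-rank
# or tighten results on the client side too. A minimal sketch (this helper is
# illustrative, not part of the API):
def top_urls_by_score(urls: List[Dict[str, Any]], n: int = 10,
                      min_score: float = 0.0) -> List[Dict[str, Any]]:
    """Keep items at or above min_score, ordered by descending BM25 score."""
    kept = [u for u in urls if u.get("score", 0) >= min_score]
    return sorted(kept, key=lambda u: u.get("score", 0), reverse=True)[:n]

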
def demonstrate_request_schema():
    """Show the complete request schema with all options."""
    print("\n📋 Complete Request Schema")
    print("=" * 50)

    complete_schema = {
        "domain": "example.com",           # Required: domain to discover URLs from
        "seeding_config": {                # Optional: configuration object
            # Discovery sources
            "source": "sitemap+cc",        # "sitemap", "cc", or "sitemap+cc"

            # Filtering options
            "pattern": "*/blog/*",         # URL pattern filter (glob style)
            "max_urls": 50,                # Maximum URLs to return (-1 = no limit)
            "filter_nonsense_urls": True,  # Filter out nonsense URLs

            # Metadata and validation
            "extract_head": True,          # Extract <head> metadata
            "live_check": True,            # Verify URL accessibility

            # Performance and rate limiting
            "concurrency": 100,            # Concurrent requests
            "hits_per_sec": 10,            # Rate limit (requests/second)
            "force": False,                # Bypass cache

            # Relevance scoring (requires extract_head=True)
            "query": "search terms",       # Query for BM25 scoring
            "scoring_method": "bm25",      # Scoring algorithm
            "score_threshold": 0.2,        # Minimum score threshold

            # Debugging
            "verbose": True                # Enable verbose logging
        }
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))


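# The "Next steps" printed by main() suggest feeding discovered URLs into the
# /crawl endpoint. A minimal hand-off sketch, assuming /crawl accepts a JSON
# body with a "urls" list (check your server's /crawl schema; this helper is
# illustrative and is not called by main()):
async def crawl_discovered_urls(urls: List[Dict[str, Any]]) -> None:
    """Submit a small sample of discovered URLs to the /crawl endpoint."""
    if not urls:
        print("No URLs to crawl")
        return

    payload = {"urls": [u["url"] for u in urls[:3]]}  # keep the sample small
    async with httpx.AsyncClient() as client:
        response = await client.post(
            f"{BASE_URL}/crawl",
            json=payload,
            timeout=120.0  # crawling is slower than discovery
        )
        response.raise_for_status()
        print(f"✅ Submitted {len(payload['urls'])} URLs to /crawl")

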
async def main():
    """Run all examples."""
    print("🚀 URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Check that the server is running
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✅ Server is running\n")
        except Exception as e:
            print(f"❌ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print("  docker compose up crawl4ai -d")
            return

    # Run examples
    await discover_urls_basic_example()
    await discover_urls_advanced_example()
    await discover_urls_with_scoring_example()

    # Show the full schema
    demonstrate_request_schema()

    print("\n🎉 Examples complete!")
    print("\nNext steps:")
    print("1. Use discovered URLs with the /crawl endpoint")
    print("2. Filter URLs based on your specific needs")
    print("3. Combine with other API endpoints for complete workflows")


if __name__ == "__main__":
    asyncio.run(main())