Add new type definitions file with extensive Union type aliases for all core components including AsyncUrlSeeder, SeedingConfig, and various crawler strategies. Enhance test coverage with improved bot detection tests, Docker-based testing, and extended features validation. The changes provide better type safety and more robust testing infrastructure for the crawling framework.
239 lines
8.2 KiB
Python
239 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Runnable example for the /urls/discover endpoint.
|
|
|
|
This script demonstrates how to use the new URL Discovery API endpoint
|
|
to find relevant URLs from a domain before committing to a full crawl.
|
|
"""
|
|
|
|
import asyncio
|
|
import httpx
|
|
import json
|
|
from typing import List, Dict, Any
|
|
|
|
# Configuration
# Root URL of the API server every example talks to (a localhost instance).
BASE_URL: str = "http://localhost:11235"
# Domain submitted to the discovery endpoint by the runnable examples.
EXAMPLE_DOMAIN: str = "nbcnews.com"
|
|
|
|
|
|
async def discover_urls_basic_example():
    """Send the simplest discovery request and preview the result.

    Posts ``EXAMPLE_DOMAIN`` to ``{BASE_URL}/urls/discover`` with a
    sitemap-only seeding config limited to 10 URLs, using a 30 second
    timeout, then prints the total count and the first three URLs.

    Returns:
        The decoded JSON response body on success, or an empty list when
        the request or the preview fails (the error is printed, not raised).
    """
    print("🔍 Basic URL Discovery Example")
    print("=" * 50)

    # Sitemap-only keeps discovery fast; the result is capped at 10 URLs.
    seeding_config = {"source": "sitemap", "max_urls": 10}
    payload = {"domain": EXAMPLE_DOMAIN, "seeding_config": seeding_config}
    endpoint = f"{BASE_URL}/urls/discover"

    async with httpx.AsyncClient() as http:
        try:
            reply = await http.post(endpoint, json=payload, timeout=30.0)
            # Turn 4xx/5xx answers into HTTPStatusError so they are reported below.
            reply.raise_for_status()

            urls = reply.json()
            print(f"✅ Found {len(urls)} URLs")

            # Only a short preview is printed; the full list is returned.
            for i, url_obj in enumerate(urls[:3]):
                print(f" {i+1}. {url_obj.get('url', 'N/A')}")

            return urls

        except httpx.HTTPStatusError as e:
            # The server answered, but with an error status.
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            # Anything else (connection failure, timeout, unexpected payload).
            print(f"❌ Error: {e}")
            return []
|
|
|
|
|
|
async def discover_urls_advanced_example():
    """Discover news URLs with filtering, live checks and head metadata.

    Posts ``EXAMPLE_DOMAIN`` to ``{BASE_URL}/urls/discover`` asking for at
    most 5 URLs matching ``*/news/*`` from both the sitemap and Common
    Crawl, with head extraction, live checking and verbose logging enabled.
    A 60 second timeout is used because these options take longer.

    For the first three results the URL, status and (when head data is
    present) the first 60 characters of the title and description are
    printed.

    Returns:
        The decoded JSON response body on success, or an empty list when
        the request or the preview fails (the error is printed, not raised).
    """
    print("\n🎯 Advanced URL Discovery Example")
    print("=" * 50)

    # Advanced discovery with filtering
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap+cc",  # Use both sitemap and Common Crawl
            "pattern": "*/news/*",  # Filter to news articles only
            "extract_head": True,  # Extract page metadata
            "max_urls": 5,
            "live_check": True,  # Verify URLs are accessible
            "verbose": True,
        },
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0,  # Longer timeout for advanced features
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} news URLs with metadata")

            # Display URLs with metadata
            for i, url_obj in enumerate(urls[:3]):
                print(f"\n {i+1}. URL: {url_obj.get('url', 'N/A')}")
                print(f" Status: {url_obj.get('status', 'unknown')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    # Use `or` instead of a .get() default: a JSON null is
                    # decoded to None, which .get() returns as-is, and slicing
                    # None would raise TypeError and discard the whole result.
                    title = head_data.get('title') or 'No title'
                    description = head_data.get('description') or 'No description'
                    print(f" Title: {title[:60]}...")
                    print(f" Description: {description[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            # The server answered, but with an error status.
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            # Anything else (connection failure, timeout, unexpected payload).
            print(f"❌ Error: {e}")
            return []
|
|
|
|
|
|
async def discover_urls_with_scoring_example():
    """Discover URLs ranked against a query with BM25 relevance scoring.

    Posts ``EXAMPLE_DOMAIN`` to ``{BASE_URL}/urls/discover`` with a
    sitemap source, head extraction, the query ``"politics election"``,
    the ``bm25`` scoring method, a 0.1 score threshold and a limit of 5
    URLs, using a 60 second timeout.

    For the first three results the score, URL and (when head data is
    present) the first 60 characters of the title are printed.

    Returns:
        The decoded JSON response body on success, or an empty list when
        the request or the preview fails (the error is printed, not raised).
    """
    print("\n🏆 URL Discovery with Relevance Scoring")
    print("=" * 50)

    # Discovery with relevance scoring
    request_data = {
        "domain": EXAMPLE_DOMAIN,
        "seeding_config": {
            "source": "sitemap",
            "extract_head": True,  # Required for BM25 scoring
            "query": "politics election",  # Search for political content
            "scoring_method": "bm25",
            "score_threshold": 0.1,  # Minimum relevance score
            "max_urls": 5,
        },
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{BASE_URL}/urls/discover",
                json=request_data,
                timeout=60.0,
            )
            response.raise_for_status()

            urls = response.json()
            print(f"✅ Found {len(urls)} relevant URLs")

            # Preview the first three results in the order the server returned
            # them; no sorting happens here.
            # NOTE(review): presumably the server already ranks by score - confirm.
            for i, url_obj in enumerate(urls[:3]):
                # `or 0` also covers an explicit JSON null, which would
                # otherwise make the ".3f" format below raise TypeError.
                score = url_obj.get('score') or 0
                print(f"\n {i+1}. Score: {score:.3f}")
                print(f" URL: {url_obj.get('url', 'N/A')}")

                head_data = url_obj.get('head_data', {})
                if head_data:
                    # Same reasoning: a null title must not reach the slice.
                    title = head_data.get('title') or 'No title'
                    print(f" Title: {title[:60]}...")

            return urls

        except httpx.HTTPStatusError as e:
            # The server answered, but with an error status.
            print(f"❌ HTTP Error: {e.response.status_code}")
            print(f"Response: {e.response.text}")
            return []
        except Exception as e:
            # Anything else (connection failure, timeout, unexpected payload).
            print(f"❌ Error: {e}")
            return []
|
|
|
|
|
|
def demonstrate_request_schema():
    """Print a sample request body that lists every seeding option.

    Nothing is sent anywhere: the schema is assembled locally and written
    to stdout as JSON indented by two spaces. Returns ``None``.
    """
    print("\n📋 Complete Request Schema")
    print("=" * 50)

    # Each group is kept separate for readability and merged in this exact
    # order below, so the printed key order stays stable.

    # Discovery sources: "sitemap", "cc", or "sitemap+cc"
    discovery = {"source": "sitemap+cc"}

    # Filtering options
    filtering = {
        "pattern": "*/blog/*",  # glob-style URL pattern filter
        "max_urls": 50,  # maximum URLs to return (-1 = no limit)
        "filter_nonsense_urls": True,  # filter out nonsense URLs
    }

    # Metadata and validation
    metadata = {
        "extract_head": True,  # extract <head> metadata
        "live_check": True,  # verify URL accessibility
    }

    # Performance and rate limiting
    performance = {
        "concurrency": 100,  # concurrent requests
        "hits_per_sec": 10,  # rate limit (requests/second)
        "force": False,  # bypass cache
    }

    # Relevance scoring (requires extract_head=True)
    scoring = {
        "query": "search terms",  # query for BM25 scoring
        "scoring_method": "bm25",  # scoring algorithm
        "score_threshold": 0.2,  # minimum score threshold
    }

    # Debugging
    debugging = {"verbose": True}  # enable verbose logging

    complete_schema = {
        "domain": "example.com",  # required: domain to discover URLs from
        "seeding_config": {  # optional: configuration object
            **discovery,
            **filtering,
            **metadata,
            **performance,
            **scoring,
            **debugging,
        },
    }

    print("Full request schema:")
    print(json.dumps(complete_schema, indent=2))
|
|
|
|
|
|
async def main():
    """Check that the server is reachable, then run every example in turn.

    Prints a banner with ``BASE_URL`` and ``EXAMPLE_DOMAIN``, probes
    ``{BASE_URL}/health`` with a 5 second timeout and, if that fails,
    prints start-up instructions and returns without running anything.
    Otherwise the three discovery examples run sequentially, the full
    request schema is printed, and a list of next steps closes the output.
    """
    print("🚀 URL Discovery API Examples")
    print("=" * 50)
    print(f"Server: {BASE_URL}")
    print(f"Domain: {EXAMPLE_DOMAIN}")

    # Health probe: bail out early with instructions if the server is down.
    async with httpx.AsyncClient() as client:
        try:
            response = await client.get(f"{BASE_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✅ Server is running\n")
        except Exception as e:
            print(f"❌ Server not available: {e}")
            print("Please start the Crawl4AI server first:")
            print(" docker compose up crawl4ai -d")
            return

    # The examples are awaited one after another, in this order.
    examples = (
        discover_urls_basic_example,
        discover_urls_advanced_example,
        discover_urls_with_scoring_example,
    )
    for run_example in examples:
        await run_example()

    # Purely local: prints the full request schema, no network involved.
    demonstrate_request_schema()

    closing_lines = (
        "\n🎉 Examples complete!",
        "\nNext steps:",
        "1. Use discovered URLs with the /crawl endpoint",
        "2. Filter URLs based on your specific needs",
        "3. Combine with other API endpoints for complete workflows",
    )
    for line in closing_lines:
        print(line)
|
|
|
|
|
|
# Run the examples only when the file is executed as a script, not on import.
if __name__ == "__main__":
    entry_coroutine = main()
    asyncio.run(entry_coroutine)