This commit adds a complete web scraping API example that demonstrates how to get structured data from any website and use it like an API, built on the crawl4ai library with a minimalist frontend interface.

Core Functionality
- AI-powered web scraping with plain English queries
- Dual scraping approaches: schema-based (faster) and LLM-based (more flexible)
- Intelligent schema caching for improved performance
- Custom LLM model support with API key management
- Automatic duplicate request prevention

Modern Frontend Interface
- Minimalist black-and-white design inspired by modern web apps
- Responsive layout with smooth animations and transitions
- Three main pages: Scrape Data, Models Management, API Request History
- Real-time results display with JSON formatting
- Copy-to-clipboard functionality for extracted data
- Toast notifications for user feedback
- Auto-scroll to results when scraping starts

Model Management System
- Web-based model configuration interface
- Support for any LLM provider (OpenAI, Gemini, Anthropic, etc.)
- Simplified configuration requiring only a provider and an API token
- Add, list, and delete model configurations
- API keys stored locally in JSON files

API Request History
- Automatic saving of all API requests and responses
- Request history display with URL, query, and cURL commands
- Duplicate prevention (same URL + query combinations)
- Request deletion functionality
- Clean, simplified display focusing on essential information

Technical Implementation

Backend (FastAPI)
- RESTful API with comprehensive endpoints
- Pydantic models for request/response validation
- Async web scraping with the crawl4ai library
- Error handling with detailed error messages
- File-based storage for models and request history

Frontend (Vanilla JS/CSS/HTML)
- No framework dependencies: pure HTML, CSS, and JavaScript
- Modern CSS Grid and Flexbox layouts
- Custom dropdown styling with SVG arrows
- Responsive design for mobile and desktop
- Smooth scrolling and animations

Core Library Integration
- WebScraperAgent class for orchestration
- ModelConfig class for LLM configuration management
- Schema generation and caching system
- LLM extraction strategy support
- Browser configuration with headless mode
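For reference, a minimal sketch of calling the API from Python once the server is running (the server binds to port 8000, as in the code below; the target URL and query are illustrative, and the Python requests library is assumed to be installed):

    import requests

    # Ask the scraper for structured data in plain English
    resp = requests.post(
        "http://localhost:8000/scrape",
        json={
            "url": "https://example.com",
            "query": "Get all product names and prices",
        },
        timeout=120,  # generous timeout: scraping plus schema generation can be slow
    )
    resp.raise_for_status()
    print(resp.json()["extracted_data"])
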
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel, HttpUrl
from typing import Dict, Any, Optional, Union, List
import uvicorn
import asyncio
import os
import json
from datetime import datetime

from web_scraper_lib import WebScraperAgent, scrape_website

app = FastAPI(
    title="Web Scraper API",
    description="Convert any website into a structured data API. Provide a URL and tell AI what data you need in plain English.",
    version="1.0.0"
)

# Mount static files
if os.path.exists("static"):
    app.mount("/static", StaticFiles(directory="static"), name="static")

# Mount assets directory
if os.path.exists("assets"):
    app.mount("/assets", StaticFiles(directory="assets"), name="assets")

# Initialize the scraper agent
scraper_agent = WebScraperAgent()

# Create directory for saved API requests
os.makedirs("saved_requests", exist_ok=True)

class ScrapeRequest(BaseModel):
    url: HttpUrl
    query: str
    model_name: Optional[str] = None


class ModelConfigRequest(BaseModel):
    model_name: str
    provider: str
    api_token: str


class ScrapeResponse(BaseModel):
    success: bool
    url: str
    query: str
    extracted_data: Union[Dict[str, Any], list]
    schema_used: Optional[Dict[str, Any]] = None
    timestamp: Optional[str] = None
    error: Optional[str] = None


class SavedApiRequest(BaseModel):
    id: str
    endpoint: str
    method: str
    headers: Dict[str, str]
    body: Dict[str, Any]
    timestamp: str
    response: Optional[Dict[str, Any]] = None

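# Illustrative request payloads for the models above. All values are
# hypothetical; the exact "provider" string format depends on what
# WebScraperAgent passes through to crawl4ai:
#
#   POST /scrape  -> {"url": "https://example.com", "query": "Get all product names"}
#   POST /models  -> {"model_name": "my-gpt", "provider": "openai/gpt-4o-mini", "api_token": "sk-..."}
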
def save_api_request(endpoint: str, method: str, headers: Dict[str, str], body: Dict[str, Any], response: Optional[Dict[str, Any]] = None) -> str:
    """Save an API request to a JSON file."""

    # Check for duplicate requests (same URL and query)
    if endpoint in ["/scrape", "/scrape-with-llm"] and "url" in body and "query" in body:
        existing_requests = get_saved_requests()
        for existing_request in existing_requests:
            if (existing_request.endpoint == endpoint and
                    existing_request.body.get("url") == body["url"] and
                    existing_request.body.get("query") == body["query"]):
                print(f"Duplicate request found for URL: {body['url']} and query: {body['query']}")
                return existing_request.id  # Return the existing request ID instead of creating a new one

    # Millisecond-resolution timestamp doubles as a filename-safe ID
    request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]

    saved_request = SavedApiRequest(
        id=request_id,
        endpoint=endpoint,
        method=method,
        headers=headers,
        body=body,
        timestamp=datetime.now().isoformat(),
        response=response
    )

    file_path = os.path.join("saved_requests", f"{request_id}.json")
    with open(file_path, "w") as f:
        json.dump(saved_request.dict(), f, indent=2)

    return request_id

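# A saved request file (saved_requests/<request_id>.json) ends up looking
# roughly like this -- illustrative values only, mirroring the
# SavedApiRequest fields:
#
#   {
#     "id": "20250101_120000_123",
#     "endpoint": "/scrape",
#     "method": "POST",
#     "headers": {"Content-Type": "application/json"},
#     "body": {"url": "https://example.com", "query": "...", "model_name": null},
#     "timestamp": "2025-01-01T12:00:00.123456",
#     "response": {"success": true, "...": "..."}
#   }
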
def get_saved_requests() -> List[SavedApiRequest]:
    """Get all saved API requests."""
    requests = []
    if os.path.exists("saved_requests"):
        for filename in os.listdir("saved_requests"):
            if filename.endswith('.json'):
                file_path = os.path.join("saved_requests", filename)
                try:
                    with open(file_path, "r") as f:
                        data = json.load(f)
                    requests.append(SavedApiRequest(**data))
                except Exception as e:
                    print(f"Error loading saved request {filename}: {e}")

    # Sort by timestamp (newest first)
    requests.sort(key=lambda x: x.timestamp, reverse=True)
    return requests

@app.get("/")
|
|
async def root():
|
|
"""Serve the frontend interface."""
|
|
if os.path.exists("static/index.html"):
|
|
return FileResponse("static/index.html")
|
|
else:
|
|
return {
|
|
"message": "Web Scraper API",
|
|
"description": "Convert any website into structured data with AI",
|
|
"endpoints": {
|
|
"/scrape": "POST - Scrape data from a website",
|
|
"/schemas": "GET - List cached schemas",
|
|
"/clear-cache": "POST - Clear schema cache",
|
|
"/models": "GET - List saved model configurations",
|
|
"/models": "POST - Save a new model configuration",
|
|
"/models/{model_name}": "DELETE - Delete a model configuration",
|
|
"/saved-requests": "GET - List saved API requests"
|
|
}
|
|
}
|
|
|
|
@app.post("/scrape", response_model=ScrapeResponse)
|
|
async def scrape_website_endpoint(request: ScrapeRequest):
|
|
"""
|
|
Scrape structured data from any website.
|
|
|
|
This endpoint:
|
|
1. Takes a URL and plain English query
|
|
2. Generates a custom scraper using AI
|
|
3. Returns structured data
|
|
"""
|
|
try:
|
|
# Save the API request
|
|
headers = {"Content-Type": "application/json"}
|
|
body = {
|
|
"url": str(request.url),
|
|
"query": request.query,
|
|
"model_name": request.model_name
|
|
}
|
|
|
|
result = await scraper_agent.scrape_data(
|
|
url=str(request.url),
|
|
query=request.query,
|
|
model_name=request.model_name
|
|
)
|
|
|
|
response_data = ScrapeResponse(
|
|
success=True,
|
|
url=result["url"],
|
|
query=result["query"],
|
|
extracted_data=result["extracted_data"],
|
|
schema_used=result["schema_used"],
|
|
timestamp=result["timestamp"]
|
|
)
|
|
|
|
# Save the request with response
|
|
save_api_request(
|
|
endpoint="/scrape",
|
|
method="POST",
|
|
headers=headers,
|
|
body=body,
|
|
response=response_data.dict()
|
|
)
|
|
|
|
return response_data
|
|
|
|
except Exception as e:
|
|
# Save the failed request
|
|
headers = {"Content-Type": "application/json"}
|
|
body = {
|
|
"url": str(request.url),
|
|
"query": request.query,
|
|
"model_name": request.model_name
|
|
}
|
|
|
|
save_api_request(
|
|
endpoint="/scrape",
|
|
method="POST",
|
|
headers=headers,
|
|
body=body,
|
|
response={"error": str(e)}
|
|
)
|
|
|
|
raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
|
|
|
|
@app.post("/scrape-with-llm", response_model=ScrapeResponse)
|
|
async def scrape_website_endpoint_with_llm(request: ScrapeRequest):
|
|
"""
|
|
Scrape structured data from any website using a custom LLM model.
|
|
"""
|
|
try:
|
|
# Save the API request
|
|
headers = {"Content-Type": "application/json"}
|
|
body = {
|
|
"url": str(request.url),
|
|
"query": request.query,
|
|
"model_name": request.model_name
|
|
}
|
|
|
|
result = await scraper_agent.scrape_data_with_llm(
|
|
url=str(request.url),
|
|
query=request.query,
|
|
model_name=request.model_name
|
|
)
|
|
|
|
response_data = ScrapeResponse(
|
|
success=True,
|
|
url=result["url"],
|
|
query=result["query"],
|
|
extracted_data=result["extracted_data"],
|
|
timestamp=result["timestamp"]
|
|
)
|
|
|
|
# Save the request with response
|
|
save_api_request(
|
|
endpoint="/scrape-with-llm",
|
|
method="POST",
|
|
headers=headers,
|
|
body=body,
|
|
response=response_data.dict()
|
|
)
|
|
|
|
return response_data
|
|
|
|
except Exception as e:
|
|
# Save the failed request
|
|
headers = {"Content-Type": "application/json"}
|
|
body = {
|
|
"url": str(request.url),
|
|
"query": request.query,
|
|
"model_name": request.model_name
|
|
}
|
|
|
|
save_api_request(
|
|
endpoint="/scrape-with-llm",
|
|
method="POST",
|
|
headers=headers,
|
|
body=body,
|
|
response={"error": str(e)}
|
|
)
|
|
|
|
raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
|
|
|
|
@app.get("/saved-requests")
|
|
async def list_saved_requests():
|
|
"""List all saved API requests."""
|
|
try:
|
|
requests = get_saved_requests()
|
|
return {
|
|
"success": True,
|
|
"requests": [req.dict() for req in requests],
|
|
"count": len(requests)
|
|
}
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to list saved requests: {str(e)}")
|
|
|
|
@app.delete("/saved-requests/{request_id}")
|
|
async def delete_saved_request(request_id: str):
|
|
"""Delete a saved API request."""
|
|
try:
|
|
file_path = os.path.join("saved_requests", f"{request_id}.json")
|
|
if os.path.exists(file_path):
|
|
os.remove(file_path)
|
|
return {
|
|
"success": True,
|
|
"message": f"Saved request '{request_id}' deleted successfully"
|
|
}
|
|
else:
|
|
raise HTTPException(status_code=404, detail=f"Saved request '{request_id}' not found")
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to delete saved request: {str(e)}")
|
|
|
|
@app.get("/schemas")
|
|
async def list_cached_schemas():
|
|
"""List all cached schemas."""
|
|
try:
|
|
schemas = await scraper_agent.get_cached_schemas()
|
|
return {
|
|
"success": True,
|
|
"cached_schemas": schemas,
|
|
"count": len(schemas)
|
|
}
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to list schemas: {str(e)}")
|
|
|
|
@app.post("/clear-cache")
|
|
async def clear_schema_cache():
|
|
"""Clear all cached schemas."""
|
|
try:
|
|
scraper_agent.clear_cache()
|
|
return {
|
|
"success": True,
|
|
"message": "Schema cache cleared successfully"
|
|
}
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to clear cache: {str(e)}")
|
|
|
|
@app.get("/models")
|
|
async def list_models():
|
|
"""List all saved model configurations."""
|
|
try:
|
|
models = scraper_agent.list_saved_models()
|
|
return {
|
|
"success": True,
|
|
"models": models,
|
|
"count": len(models)
|
|
}
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to list models: {str(e)}")
|
|
|
|
@app.post("/models")
|
|
async def save_model_config(request: ModelConfigRequest):
|
|
"""Save a new model configuration."""
|
|
try:
|
|
success = scraper_agent.save_model_config(
|
|
model_name=request.model_name,
|
|
provider=request.provider,
|
|
api_token=request.api_token
|
|
)
|
|
|
|
if success:
|
|
return {
|
|
"success": True,
|
|
"message": f"Model configuration '{request.model_name}' saved successfully"
|
|
}
|
|
else:
|
|
raise HTTPException(status_code=500, detail="Failed to save model configuration")
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to save model: {str(e)}")
|
|
|
|
@app.delete("/models/{model_name}")
|
|
async def delete_model_config(model_name: str):
|
|
"""Delete a model configuration."""
|
|
try:
|
|
success = scraper_agent.delete_model_config(model_name)
|
|
|
|
if success:
|
|
return {
|
|
"success": True,
|
|
"message": f"Model configuration '{model_name}' deleted successfully"
|
|
}
|
|
else:
|
|
raise HTTPException(status_code=404, detail=f"Model configuration '{model_name}' not found")
|
|
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=f"Failed to delete model: {str(e)}")
|
|
|
|
@app.get("/health")
|
|
async def health_check():
|
|
"""Health check endpoint."""
|
|
return {"status": "healthy", "service": "web-scraper-api"}
|
|
|
|
if __name__ == "__main__":
|
|
uvicorn.run(app, host="0.0.0.0", port=8000) |
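
# Served at http://localhost:8000 by default; the frontend (when the static/
# directory exists) is at "/", and FastAPI's interactive docs are at /docs.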