crawl4ai/docs/examples/website-to-api/web_scraper_lib.py
Soham Kukreti b1dff5a4d3 feat: Add comprehensive website to API example with frontend (2025-08-24)
This commit adds a complete web scraping API example that demonstrates how to extract structured data from any website and consume it like an API, using the crawl4ai library together with a minimalist frontend interface.

Core Functionality
- AI-powered web scraping with plain English queries
- Dual scraping approaches: schema-based (faster) and LLM-based (more flexible); see the sketch after this list
- Intelligent schema caching for improved performance
- Custom LLM model support with API key management
- Automatic duplicate request prevention
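A minimal usage sketch of the two approaches, based on the convenience functions defined in this file (the URL and the saved model name "gemini-flash" are illustrative; the model must first be saved via the Models page or save_model_config):

    import asyncio
    from web_scraper_lib import scrape_website, scrape_website_with_llm

    async def main():
        # Schema-based: generates and caches a CSS extraction schema, then scrapes with it
        result = await scrape_website(
            "https://example.com/products",          # illustrative URL
            "Extract the product name and price",
            model_name="gemini-flash",               # illustrative saved model name
        )
        # LLM-based: the model extracts the fields from the page directly
        result_llm = await scrape_website_with_llm(
            "https://example.com/products",
            "Extract the product name and price",
            model_name="gemini-flash",
        )
        print(result["extracted_data"])
        print(result_llm["extracted_data"])

    asyncio.run(main())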

Modern Frontend Interface
- Minimalist black-and-white design inspired by modern web apps
- Responsive layout with smooth animations and transitions
- Three main pages: Scrape Data, Models Management, API Request History
- Real-time results display with JSON formatting
- Copy-to-clipboard functionality for extracted data
- Toast notifications for user feedback
- Auto-scroll to results when scraping starts

Model Management System
- Web-based model configuration interface
- Support for any LLM provider (OpenAI, Gemini, Anthropic, etc.)
- Simplified configuration requiring only provider and API token
- Add, list, and delete model configurations
- Storage of API keys in local JSON files (see the example after this list)
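Each configuration is written to models/<name>.json. A short sketch of saving one (the provider string and key are illustrative):

    from web_scraper_lib import WebScraperAgent

    agent = WebScraperAgent()
    # Creates models/gemini-flash.json containing the provider and token
    agent.save_model_config(
        model_name="gemini-flash",           # illustrative name
        provider="gemini/gemini-1.5-flash",  # illustrative provider string
        api_token="YOUR_API_KEY",
    )
    # File contents: {"provider": "gemini/gemini-1.5-flash", "api_token": "YOUR_API_KEY"}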

API Request History
- Automatic saving of all API requests and responses
- Display of request history with URL, query, and cURL commands
- Duplicate prevention for identical URL + query combinations (see the sketch after this list)
- Request deletion functionality
- Clean, simplified display focusing on essential information
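The history logic lives in the FastAPI backend rather than in this file, but the duplicate check can reuse the same keying idea the library applies to schema caching: hash the URL + query pair and skip saving when the key is already present. A hypothetical sketch:

    import hashlib

    def request_key(url: str, query: str) -> str:
        # Same keying scheme as WebScraperAgent._generate_schema_key
        return hashlib.md5(f"{url}:{query}".encode()).hexdigest()

    def is_duplicate(history: dict, url: str, query: str) -> bool:
        # Identical URL + query combinations map to the same key
        return request_key(url, query) in history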

Technical Implementation

Backend (FastAPI)
- RESTful API with comprehensive endpoints (a minimal endpoint sketch follows this list)
- Pydantic models for request/response validation
- Async web scraping with crawl4ai library
- Error handling with detailed error messages
- File-based storage for models and request history
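The endpoints themselves are defined in a separate backend file; a minimal sketch of what a scrape endpoint could look like on top of this library (the route and field names are illustrative, not the actual API):

    from fastapi import FastAPI, HTTPException
    from pydantic import BaseModel

    from web_scraper_lib import WebScraperAgent

    app = FastAPI()
    agent = WebScraperAgent()

    class ScrapeRequest(BaseModel):
        url: str
        query: str
        model_name: str

    @app.post("/scrape")  # illustrative route
    async def scrape(req: ScrapeRequest):
        try:
            return await agent.scrape_data(req.url, req.query, req.model_name)
        except ValueError as e:
            raise HTTPException(status_code=400, detail=str(e))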

Frontend (Vanilla JS/CSS/HTML)
- No framework dependencies - pure HTML, CSS, JavaScript
- Modern CSS Grid and Flexbox layouts
- Custom dropdown styling with SVG arrows
- Responsive design for mobile and desktop
- Smooth scrolling and animations

Core Library Integration
- WebScraperAgent class for orchestration
- ModelConfig class for LLM configuration management
- Schema generation and caching system (an example schema shape follows this list)
- LLM extraction strategy support
- Browser configuration with headless mode
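The cached schemas follow crawl4ai's JsonCssExtractionStrategy format: a base CSS selector for the repeating element plus per-field selectors. Roughly what a generated schema looks like (the selectors are illustrative):

    from crawl4ai import JsonCssExtractionStrategy

    schema = {
        "name": "Products",                  # illustrative
        "baseSelector": "div.product-card",
        "fields": [
            {"name": "product_name", "selector": "h2.title", "type": "text"},
            {"name": "product_price", "selector": "span.price", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema=schema)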

import hashlib
import json
import os
import shutil
from typing import Any, Dict, List, Optional

from litellm import completion

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CacheMode,
    CrawlerRunConfig,
    JsonCssExtractionStrategy,
    LLMConfig,
    LLMExtractionStrategy,
)

class ModelConfig:
    """Configuration for LLM models."""

    def __init__(self, provider: str, api_token: str):
        self.provider = provider
        self.api_token = api_token

    def to_dict(self) -> Dict[str, Any]:
        return {
            "provider": self.provider,
            "api_token": self.api_token,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'ModelConfig':
        return cls(
            provider=data["provider"],
            api_token=data["api_token"],
        )

class WebScraperAgent:
    """
    A mini library that converts any website into a structured data API.

    Features:
    1. Provide a URL and describe the data you need in plain English
    2. Generate: the agent reverse-engineers the site and deploys a custom scraper
    3. Integrate: use a private API endpoint to get structured data
    4. Support for custom LLM models and API keys
    """

    def __init__(self, schemas_dir: str = "schemas", models_dir: str = "models"):
        self.schemas_dir = schemas_dir
        self.models_dir = models_dir
        os.makedirs(self.schemas_dir, exist_ok=True)
        os.makedirs(self.models_dir, exist_ok=True)

    def _generate_schema_key(self, url: str, query: str) -> str:
        """Generate a unique key for schema caching based on URL and query."""
        content = f"{url}:{query}"
        return hashlib.md5(content.encode()).hexdigest()

    def save_model_config(self, model_name: str, provider: str, api_token: str) -> bool:
        """
        Save a model configuration for later use.

        Args:
            model_name: User-friendly name for the model
            provider: LLM provider (e.g., 'gemini', 'openai', 'anthropic')
            api_token: API token for the provider

        Returns:
            True if saved successfully
        """
        try:
            model_config = ModelConfig(provider, api_token)
            config_path = os.path.join(self.models_dir, f"{model_name}.json")
            with open(config_path, "w") as f:
                json.dump(model_config.to_dict(), f, indent=2)
            print(f"Model configuration saved: {model_name}")
            return True
        except Exception as e:
            print(f"Failed to save model configuration: {e}")
            return False

    def load_model_config(self, model_name: str) -> Optional[ModelConfig]:
        """
        Load a saved model configuration.

        Args:
            model_name: Name of the saved model configuration

        Returns:
            ModelConfig object or None if not found
        """
        try:
            config_path = os.path.join(self.models_dir, f"{model_name}.json")
            if not os.path.exists(config_path):
                return None
            with open(config_path, "r") as f:
                data = json.load(f)
            return ModelConfig.from_dict(data)
        except Exception as e:
            print(f"Failed to load model configuration: {e}")
            return None

    def list_saved_models(self) -> List[str]:
        """List all saved model configurations."""
        models = []
        for filename in os.listdir(self.models_dir):
            if filename.endswith('.json'):
                models.append(filename[:-5])  # Remove .json extension
        return models

    def delete_model_config(self, model_name: str) -> bool:
        """
        Delete a saved model configuration.

        Args:
            model_name: Name of the model configuration to delete

        Returns:
            True if deleted successfully
        """
        try:
            config_path = os.path.join(self.models_dir, f"{model_name}.json")
            if os.path.exists(config_path):
                os.remove(config_path)
                print(f"Model configuration deleted: {model_name}")
                return True
            return False
        except Exception as e:
            print(f"Failed to delete model configuration: {e}")
            return False

    async def _load_or_generate_schema(
        self,
        url: str,
        query: str,
        session_id: str = "schema_generator",
        model_name: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Load the schema from the cache if it exists; otherwise generate one using AI.
        This is the "Generate" step: the agent reverse-engineers the site.

        Args:
            url: URL to scrape
            query: Query for data extraction
            session_id: Session identifier
            model_name: Name of saved model configuration to use
        """
        schema_key = self._generate_schema_key(url, query)
        schema_path = os.path.join(self.schemas_dir, f"{schema_key}.json")
        if os.path.exists(schema_path):
            print(f"Schema found in cache for {url}")
            with open(schema_path, "r") as f:
                return json.load(f)

        print(f"Generating new schema for {url}")
        print(f"Query: {query}")
        query += """
        IMPORTANT:
        GENERATE THE SCHEMA WITH ONLY THE FIELDS MENTIONED IN THE QUERY. MAKE SURE THE NUMBER OF FIELDS IN THE SCHEMA MATCHES THE NUMBER OF FIELDS IN THE QUERY.
        """

        # Step 1: Fetch the page HTML
        async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    session_id=session_id,
                    simulate_user=True,
                    remove_overlay_elements=True,
                    delay_before_return_html=5,
                ),
            )
            html = result.fit_html

        # Step 2: Generate schema using AI, with a custom model if specified
        print("AI is analyzing the page structure...")
        if model_name:
            model_config = self.load_model_config(model_name)
            if model_config:
                llm_config = LLMConfig(
                    provider=model_config.provider,
                    api_token=model_config.api_token,
                )
                print(f"Using custom model: {model_name}")
            else:
                raise ValueError(
                    f"Model configuration '{model_name}' not found. "
                    "Please add it from the Models page."
                )
        else:
            # Require a model to be specified
            raise ValueError(
                "No model specified. Please select a model from the dropdown "
                "or add one from the Models page."
            )

        schema = JsonCssExtractionStrategy.generate_schema(
            html=html,
            llm_config=llm_config,
            query=query,
        )

        # Step 3: Cache the generated schema
        print(f"Schema generated and cached: {json.dumps(schema, indent=2)}")
        with open(schema_path, "w") as f:
            json.dump(schema, f, indent=2)
        return schema

    def _generate_llm_schema(self, query: str, llm_config: LLMConfig) -> Dict[str, Any]:
        """
        Generate a schema for a given query using a custom LLM model.

        Args:
            query: Plain English description of what data to extract
            llm_config: LLM configuration to use
        """
        # Ask the model to produce a JSON schema for the fields named in the query.
        prompt = f"""
        IDENTIFY THE FIELDS FOR EXTRACTION MENTIONED IN THE QUERY AND GENERATE A JSON SCHEMA FOR THE FIELDS.
        e.g.
        {{
            "name": "str",
            "age": "str",
            "email": "str",
            "product_name": "str",
            "product_price": "str",
            "product_description": "str",
            "product_image": "str",
            "product_url": "str",
            "product_rating": "str",
            "product_reviews": "str"
        }}
        Here is the query:
        {query}
        IMPORTANT:
        THE RESULT SHOULD BE A JSON OBJECT.
        MAKE SURE THE NUMBER OF FIELDS IN THE RESULT MATCHES THE NUMBER OF FIELDS IN THE QUERY.
        """
        response = completion(
            model=llm_config.provider,
            messages=[{"role": "user", "content": prompt}],
            api_key=llm_config.api_token,
            # Request JSON mode where the provider supports it
            response_format={"type": "json_object"},
        )
        # litellm returns a ModelResponse; the JSON text lives in the message content
        return json.loads(response.choices[0].message.content)

    async def scrape_data_with_llm(self, url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape structured data from any website using a custom LLM model.

        Args:
            url: The website URL to scrape
            query: Plain English description of what data to extract
            model_name: Name of saved model configuration to use
        """
        if model_name:
            model_config = self.load_model_config(model_name)
            if model_config:
                llm_config = LLMConfig(
                    provider=model_config.provider,
                    api_token=model_config.api_token,
                )
                print(f"Using custom model: {model_name}")
            else:
                raise ValueError(
                    f"Model configuration '{model_name}' not found. "
                    "Please add it from the Models page."
                )
        else:
            # Require a model to be specified
            raise ValueError(
                "No model specified. Please select a model from the dropdown "
                "or add one from the Models page."
            )

        query += """
        IMPORTANT:
        THE RESULT SHOULD BE A JSON OBJECT WITH ONLY THE FIELDS MENTIONED IN THE QUERY.
        MAKE SURE THE NUMBER OF FIELDS IN THE RESULT MATCHES THE NUMBER OF FIELDS IN THE QUERY.
        """
        schema = self._generate_llm_schema(query, llm_config)
        print(f"Schema: {schema}")
        llm_extraction_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction=query,
            extraction_type="schema",
            schema=schema,
        )
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,
                    simulate_user=True,
                    extraction_strategy=llm_extraction_strategy,
                ),
            )

        extracted_data = result.extracted_content
        if isinstance(extracted_data, str):
            try:
                extracted_data = json.loads(extracted_data)
            except json.JSONDecodeError:
                # If it's not valid JSON, keep it as a string
                pass
        return {
            "url": url,
            "query": query,
            "extracted_data": extracted_data,
            "timestamp": result.timestamp if hasattr(result, 'timestamp') else None,
        }

    async def scrape_data(self, url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Main method to scrape structured data from any website.

        Args:
            url: The website URL to scrape
            query: Plain English description of what data to extract
            model_name: Name of saved model configuration to use

        Returns:
            Structured data extracted from the website
        """
        # Step 1: Generate or load schema (reverse-engineer the site)
        schema = await self._load_or_generate_schema(url=url, query=query, model_name=model_name)

        # Step 2: Deploy custom high-speed scraper
        print(f"Deploying custom scraper for {url}")
        browser_config = BrowserConfig(headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            run_config = CrawlerRunConfig(
                extraction_strategy=JsonCssExtractionStrategy(schema=schema),
            )
            result = await crawler.arun(url=url, config=run_config)

        # Step 3: Return structured data
        # Parse extracted_content if it's a JSON string
        extracted_data = result.extracted_content
        if isinstance(extracted_data, str):
            try:
                extracted_data = json.loads(extracted_data)
            except json.JSONDecodeError:
                # If it's not valid JSON, keep it as string
                pass
        return {
            "url": url,
            "query": query,
            "extracted_data": extracted_data,
            "schema_used": schema,
            "timestamp": result.timestamp if hasattr(result, 'timestamp') else None,
        }

    async def get_cached_schemas(self) -> Dict[str, str]:
        """Get list of cached schemas."""
        schemas = {}
        for filename in os.listdir(self.schemas_dir):
            if filename.endswith('.json'):
                schema_key = filename[:-5]  # Remove .json extension
                schemas[schema_key] = filename
        return schemas

    def clear_cache(self):
        """Clear all cached schemas."""
        if os.path.exists(self.schemas_dir):
            shutil.rmtree(self.schemas_dir)
        os.makedirs(self.schemas_dir, exist_ok=True)
        print("Schema cache cleared")

# Convenience functions for simple usage

async def scrape_website(url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
    """
    Scrape any website with plain English instructions.

    Args:
        url: Website URL
        query: Plain English description of what data to extract
        model_name: Name of saved model configuration to use

    Returns:
        Extracted structured data
    """
    agent = WebScraperAgent()
    return await agent.scrape_data(url, query, model_name)


async def scrape_website_with_llm(url: str, query: str, model_name: Optional[str] = None) -> Dict[str, Any]:
    """
    Scrape structured data from any website using a custom LLM model.

    Args:
        url: The website URL to scrape
        query: Plain English description of what data to extract
        model_name: Name of saved model configuration to use
    """
    agent = WebScraperAgent()
    return await agent.scrape_data_with_llm(url, query, model_name)
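
# Illustrative cache-management usage (a sketch assuming the default
# "schemas" directory; not part of the library's API surface):
#
#     import asyncio
#     agent = WebScraperAgent()
#     print(asyncio.run(agent.get_cached_schemas()))  # maps schema_key -> filename
#     agent.clear_cache()                             # removes all cached schemas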