Add all 5 deployment solutions for testing

UncleCode
2025-03-10 18:57:14 +08:00
parent 9547bada3a
commit 3ea3c0520d
38 changed files with 6431 additions and 0 deletions


@@ -0,0 +1,543 @@
import os
import time
import uuid
from datetime import datetime
from typing import Dict, Any, Optional, List
import modal
from modal import Image, App, Volume, Secret, web_endpoint
# Configuration
APP_NAME = "crawl4ai-api"
CRAWL4AI_VERSION = "next" # Using the 'next' branch
PYTHON_VERSION = "3.10" # Compatible with playwright
DEFAULT_CREDITS = 1000
# Create a custom image with Crawl4ai and its dependencies
image = Image.debian_slim(python_version=PYTHON_VERSION).pip_install(
["fastapi[standard]", "pymongo", "pydantic"]
).run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-get install -y git",
"apt-add-repository non-free",
"apt-add-repository contrib",
# Install crawl4ai from the next branch
f"pip install -U git+https://github.com/unclecode/crawl4ai.git@{CRAWL4AI_VERSION}",
"pip install -U fastapi[standard]",
"pip install -U pydantic",
# Install playwright and browsers
"crawl4ai-setup",
)
# Create persistent volume for user database
user_db = Volume.from_name("crawl4ai-users", create_if_missing=True)
# Admin secret for secure operations; create it once beforehand, e.g.:
#   modal secret create admin-secret ADMIN_TOKEN=<your-token>
admin_secret = Secret.from_name("admin-secret")
# Define the app
app = App(APP_NAME, image=image)
# Default configurations
DEFAULT_BROWSER_CONFIG = {
"headless": True,
"verbose": False,
}
DEFAULT_CRAWLER_CONFIG = {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
# Database operations
# NOTE: these helpers connect to MongoDB at localhost:27017 inside the
# container; the image above does not install or launch mongod, so a MongoDB
# server (e.g. one started against the /data volume) must be available for
# these calls to succeed.
@app.function(volumes={"/data": user_db})
def init_db() -> None:
"""Initialize database with indexes."""
from pymongo import MongoClient, ASCENDING
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
# Ensure indexes for faster lookups
db.users.create_index([("api_token", ASCENDING)], unique=True)
db.users.create_index([("email", ASCENDING)], unique=True)
# Create usage stats collection
db.usage_stats.create_index([("user_id", ASCENDING), ("timestamp", ASCENDING)])
print("Database initialized with required indexes")
@app.function(volumes={"/data": user_db})
def get_user_by_token(api_token: str) -> Optional[Dict[str, Any]]:
"""Get user by API token."""
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
user = db.users.find_one({"api_token": api_token})
if not user:
return None
# Convert ObjectId to string for serialization
user["_id"] = str(user["_id"])
return user
@app.function(volumes={"/data": user_db})
def create_user(email: str, name: str) -> Dict[str, Any]:
"""Create a new user with initial credits."""
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
# Generate API token
api_token = str(uuid.uuid4())
user = {
"email": email,
"name": name,
"api_token": api_token,
"credits": DEFAULT_CREDITS,
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow(),
"is_active": True
}
try:
result = db.users.insert_one(user)
user["_id"] = str(result.inserted_id)
return user
except DuplicateKeyError:
return {"error": "User with this email already exists"}
@app.function(volumes={"/data": user_db})
def update_user_credits(api_token: str, amount: int) -> Dict[str, Any]:
"""Update user credits (add or subtract)."""
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
# First get current user to check credits
user = db.users.find_one({"api_token": api_token})
if not user:
return {"success": False, "error": "User not found"}
# For deductions, ensure sufficient credits
if amount < 0 and user["credits"] + amount < 0:
return {"success": False, "error": "Insufficient credits"}
# Update credits
result = db.users.update_one(
{"api_token": api_token},
{
"$inc": {"credits": amount},
"$set": {"updated_at": datetime.utcnow()}
}
)
if result.modified_count == 1:
# Get updated user
updated_user = db.users.find_one({"api_token": api_token})
return {
"success": True,
"credits": updated_user["credits"]
}
else:
return {"success": False, "error": "Failed to update credits"}
@app.function(volumes={"/data": user_db})
def log_usage(user_id: str, url: str, success: bool, error: Optional[str] = None) -> None:
"""Log usage statistics."""
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
log_entry = {
"user_id": user_id,
"url": url,
"timestamp": datetime.utcnow(),
"success": success,
"error": error
}
db.usage_stats.insert_one(log_entry)
# Main crawling function
@app.function(timeout=300) # 5 minute timeout
async def crawl(
url: str,
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Crawl a given URL using Crawl4ai.
Args:
url: The URL to crawl
browser_config: Optional browser configuration to override defaults
crawler_config: Optional crawler configuration to override defaults
Returns:
A dictionary containing the crawl results
"""
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CrawlResult
)
# Prepare browser config using the loader method
if browser_config is None:
browser_config = DEFAULT_BROWSER_CONFIG
browser_config_obj = BrowserConfig.load(browser_config)
# Prepare crawler config using the loader method
if crawler_config is None:
crawler_config = DEFAULT_CRAWLER_CONFIG
crawler_config_obj = CrawlerRunConfig.load(crawler_config)
# Perform the crawl
async with AsyncWebCrawler(config=browser_config_obj) as crawler:
result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)
# Return serializable results
try:
# Try newer Pydantic v2 method
return result.model_dump()
except AttributeError:
try:
# Try older Pydantic v1 method
return result.dict()
except AttributeError:
# Fallback to manual conversion; field names vary across crawl4ai
# versions, so guard every attribute access
return {
"url": getattr(result, "url", url),
"title": getattr(result, "title", None),
"status": getattr(result, "status", None),
"content": str(result.content) if hasattr(result, "content") else None,
"links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [],
"markdown_v2": {
"raw_markdown": result.markdown_v2.raw_markdown if getattr(result, "markdown_v2", None) else None
}
}
# API endpoints
@app.function()
@web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Web endpoint that accepts POST requests with JSON data containing:
- api_token: User's API token
- url: The URL to crawl
- browser_config: Optional browser configuration
- crawler_config: Optional crawler configuration
Returns the crawl results and remaining credits.
"""
# Extract and validate API token
api_token = data.get("api_token")
if not api_token:
return {
"success": False,
"error": "API token is required",
"status_code": 401
}
# Verify user
user = get_user_by_token.remote(api_token)
if not user:
return {
"success": False,
"error": "Invalid API token",
"status_code": 401
}
if not user.get("is_active", False):
return {
"success": False,
"error": "Account is inactive",
"status_code": 403
}
# Validate URL
url = data.get("url")
if not url:
return {
"success": False,
"error": "URL is required",
"status_code": 400
}
# Check credits
if user.get("credits", 0) <= 0:
return {
"success": False,
"error": "Insufficient credits",
"status_code": 403
}
# Deduct credit first (1 credit per call)
credit_result = update_user_credits.remote(api_token, -1)
if not credit_result.get("success", False):
return {
"success": False,
"error": credit_result.get("error", "Failed to process credits"),
"status_code": 500
}
# Extract configs
browser_config = data.get("browser_config")
crawler_config = data.get("crawler_config")
# Perform crawl
try:
start_time = time.time()
result = crawl.remote(url, browser_config, crawler_config)
execution_time = time.time() - start_time
# Log successful usage
log_usage.spawn(user["_id"], url, True)
return {
"success": True,
"data": result,
"credits_remaining": credit_result.get("credits"),
"execution_time_seconds": round(execution_time, 2),
"status_code": 200
}
except Exception as e:
# Log failed usage
log_usage.spawn(user["_id"], url, False, str(e))
# Return error
return {
"success": False,
"error": f"Crawling error: {str(e)}",
"credits_remaining": credit_result.get("credits"),
"status_code": 500
}
# Admin endpoints
@app.function(secrets=[admin_secret])
@web_endpoint(method="POST")
def admin_create_user(data: Dict[str, Any]) -> Dict[str, Any]:
"""Admin endpoint to create new users."""
# Validate admin token
admin_token = data.get("admin_token")
if admin_token != os.environ.get("ADMIN_TOKEN"):
return {
"success": False,
"error": "Invalid admin token",
"status_code": 401
}
# Validate input
email = data.get("email")
name = data.get("name")
if not email or not name:
return {
"success": False,
"error": "Email and name are required",
"status_code": 400
}
# Create user
user = create_user.remote(email, name)
if "error" in user:
return {
"success": False,
"error": user["error"],
"status_code": 400
}
return {
"success": True,
"data": {
"user_id": user["_id"],
"email": user["email"],
"name": user["name"],
"api_token": user["api_token"],
"credits": user["credits"],
"created_at": user["created_at"].isoformat() if isinstance(user["created_at"], datetime) else user["created_at"]
},
"status_code": 201
}
@app.function(secrets=[admin_secret])
@web_endpoint(method="POST")
def admin_update_credits(data: Dict[str, Any]) -> Dict[str, Any]:
"""Admin endpoint to update user credits."""
# Validate admin token
admin_token = data.get("admin_token")
if admin_token != os.environ.get("ADMIN_TOKEN"):
return {
"success": False,
"error": "Invalid admin token",
"status_code": 401
}
# Validate input
api_token = data.get("api_token")
amount = data.get("amount")
if not api_token:
return {
"success": False,
"error": "API token is required",
"status_code": 400
}
if not isinstance(amount, int):
return {
"success": False,
"error": "Amount must be an integer",
"status_code": 400
}
# Update credits
result = update_user_credits.remote(api_token, amount)
if not result.get("success", False):
return {
"success": False,
"error": result.get("error", "Failed to update credits"),
"status_code": 400
}
return {
"success": True,
"data": {
"credits": result["credits"]
},
"status_code": 200
}
@app.function(secrets=[admin_secret])
@web_endpoint(method="GET")
def admin_get_users(admin_token: str) -> Dict[str, Any]:
"""Admin endpoint to list all users."""
# Validate admin token
if admin_token != os.environ.get("ADMIN_TOKEN"):
return {
"success": False,
"error": "Invalid admin token",
"status_code": 401
}
users = get_all_users.remote()
return {
"success": True,
"data": users,
"status_code": 200
}
@app.function(volumes={"/data": user_db})
def get_all_users() -> List[Dict[str, Any]]:
"""Get all users (for admin)."""
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
db = client.crawl4ai_db
users = []
for user in db.users.find():
# Convert ObjectId to string
user["_id"] = str(user["_id"])
# Convert datetime to ISO format
for field in ["created_at", "updated_at"]:
if field in user and isinstance(user[field], datetime):
user[field] = user[field].isoformat()
users.append(user)
return users
# Public endpoints
@app.function()
@web_endpoint(method="GET")
def health_check() -> Dict[str, Any]:
"""Health check endpoint."""
return {
"status": "online",
"service": APP_NAME,
"version": CRAWL4AI_VERSION,
"timestamp": datetime.utcnow().isoformat()
}
@app.function()
@web_endpoint(method="GET")
def check_credits(api_token: str) -> Dict[str, Any]:
"""Check user credits."""
if not api_token:
return {
"success": False,
"error": "API token is required",
"status_code": 401
}
user = get_user_by_token.remote(api_token)
if not user:
return {
"success": False,
"error": "Invalid API token",
"status_code": 401
}
return {
"success": True,
"data": {
"credits": user["credits"],
"email": user["email"],
"name": user["name"]
},
"status_code": 200
}
# Local entrypoint for testing
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
"""Command line entrypoint for local testing."""
print("Initializing database...")
init_db.remote()
print(f"Testing crawl on URL: {url}")
result = crawl.remote(url)
# Print sample of result
print("\nCrawl Result Sample:")
if "title" in result:
print(f"Title: {result['title']}")
if "status" in result:
print(f"Status: {result['status']}")
if "links" in result:
print(f"Links found: {len(result['links'])}")
if "markdown_v2" in result and result["markdown_v2"] and "raw_markdown" in result["markdown_v2"]:
print("\nMarkdown Preview (first 300 chars):")
print(result["markdown_v2"]["raw_markdown"][:300] + "...")

deploy/modal/entry.py Normal file (+127 lines)

@@ -0,0 +1,127 @@
import modal
from typing import Optional, Dict, Any
# Create a custom image with Crawl4ai and its dependencies
# "pip install crawl4ai",
image = modal.Image.debian_slim(python_version="3.10").pip_install(["fastapi[standard]"]).run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-get install -y git",
"apt-add-repository non-free",
"apt-add-repository contrib",
"pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
"pip install -U fastapi[standard]",
"pip install -U pydantic",
"crawl4ai-setup", # This installs playwright and downloads chromium
# Print fastapi version
"python -m fastapi --version",
)
# Define the app
app = modal.App("crawl4ai", image=image)
# Define default configurations
DEFAULT_BROWSER_CONFIG = {
"headless": True,
"verbose": False,
}
DEFAULT_CRAWLER_CONFIG = {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
@app.function(timeout=300) # 5 minute timeout
async def crawl(
url: str,
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Crawl a given URL using Crawl4ai.
Args:
url: The URL to crawl
browser_config: Optional browser configuration to override defaults
crawler_config: Optional crawler configuration to override defaults
Returns:
A dictionary containing the crawl results
"""
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CrawlResult
)
# Prepare browser config using the loader method
if browser_config is None:
browser_config = DEFAULT_BROWSER_CONFIG
browser_config_obj = BrowserConfig.load(browser_config)
# Prepare crawler config using the loader method
if crawler_config is None:
crawler_config = DEFAULT_CRAWLER_CONFIG
crawler_config_obj = CrawlerRunConfig.load(crawler_config)
# Perform the crawl
async with AsyncWebCrawler(config=browser_config_obj) as crawler:
result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)
# Return serializable results
try:
# Try newer Pydantic v2 method
return result.model_dump()
except AttributeError:
try:
# Fall back to the instance's attribute dict (__dict__ is not the real Pydantic v1 .dict() method)
return result.__dict__
except AttributeError:
# Last resort: return the raw result as-is (may not be JSON-serializable)
return result
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Web endpoint that accepts POST requests with JSON data containing:
- url: The URL to crawl
- browser_config: Optional browser configuration
- crawler_config: Optional crawler configuration
Returns the crawl results.
"""
url = data.get("url")
if not url:
return {"error": "URL is required"}
browser_config = data.get("browser_config")
crawler_config = data.get("crawler_config")
return crawl.remote(url, browser_config, crawler_config)
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
"""
Command line entrypoint for local testing.
"""
result = crawl.remote(url)
print(result)

deploy/modal/guide.md Normal file (+453 lines)

@@ -0,0 +1,453 @@
# Deploying Crawl4ai with Modal: A Comprehensive Tutorial
Hey there! UncleCode here. I'm excited to show you how to deploy Crawl4ai using Modal - a fantastic serverless platform that makes deployment super simple and scalable.
In this tutorial, I'll walk you through deploying your own Crawl4ai instance on Modal's infrastructure. This will give you a powerful, scalable web crawling solution without having to worry about infrastructure management.
## What is Modal?
Modal is a serverless platform that allows you to run Python functions in the cloud without managing servers. It's perfect for deploying Crawl4ai because:
1. It handles all the infrastructure for you
2. It scales automatically based on demand
3. It makes deployment incredibly simple
## Prerequisites
Before we get started, you'll need:
- A Modal account (sign up at [modal.com](https://modal.com))
- Python 3.10 or later installed on your local machine
- Basic familiarity with Python and command-line operations
## Step 1: Setting Up Your Modal Account
First, sign up for a Modal account at [modal.com](https://modal.com) if you haven't already. Modal offers a generous free tier that's perfect for getting started.
After signing up, install the Modal CLI and authenticate:
```bash
pip install modal
modal token new
```
This will open a browser window where you can authenticate and generate a token for the CLI.
## Step 2: Creating Your Crawl4ai Deployment
Now, let's create a Python file called `crawl4ai_modal.py` with our deployment code:
```python
import modal
from typing import Optional, Dict, Any
# Create a custom image with Crawl4ai and its dependencies
image = modal.Image.debian_slim(python_version="3.10").pip_install(
["fastapi[standard]"]
).run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-get install -y git",
"apt-add-repository non-free",
"apt-add-repository contrib",
"pip install -U crawl4ai",
"pip install -U fastapi[standard]",
"pip install -U pydantic",
"crawl4ai-setup", # This installs playwright and downloads chromium
)
# Define the app
app = modal.App("crawl4ai", image=image)
# Define default configurations
DEFAULT_BROWSER_CONFIG = {
"headless": True,
"verbose": False,
}
DEFAULT_CRAWLER_CONFIG = {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
@app.function(timeout=300) # 5 minute timeout
async def crawl(
url: str,
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Crawl a given URL using Crawl4ai.
Args:
url: The URL to crawl
browser_config: Optional browser configuration to override defaults
crawler_config: Optional crawler configuration to override defaults
Returns:
A dictionary containing the crawl results
"""
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CrawlResult
)
# Prepare browser config using the loader method
if browser_config is None:
browser_config = DEFAULT_BROWSER_CONFIG
browser_config_obj = BrowserConfig.load(browser_config)
# Prepare crawler config using the loader method
if crawler_config is None:
crawler_config = DEFAULT_CRAWLER_CONFIG
crawler_config_obj = CrawlerRunConfig.load(crawler_config)
# Perform the crawl
async with AsyncWebCrawler(config=browser_config_obj) as crawler:
result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)
# Return serializable results
try:
# Try newer Pydantic v2 method
return result.model_dump()
except AttributeError:
try:
# Try older Pydantic v1 method
return result.dict()
except AttributeError:
# Fallback to manual conversion; field names vary across crawl4ai
# versions, so guard every attribute access
return {
"url": getattr(result, "url", url),
"title": getattr(result, "title", None),
"status": getattr(result, "status", None),
"content": str(result.content) if hasattr(result, "content") else None,
"links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [],
"markdown_v2": {
"raw_markdown": result.markdown_v2.raw_markdown if getattr(result, "markdown_v2", None) else None
}
}
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Web endpoint that accepts POST requests with JSON data containing:
- url: The URL to crawl
- browser_config: Optional browser configuration
- crawler_config: Optional crawler configuration
Returns the crawl results.
"""
url = data.get("url")
if not url:
return {"error": "URL is required"}
browser_config = data.get("browser_config")
crawler_config = data.get("crawler_config")
return crawl.remote(url, browser_config, crawler_config)
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
"""
Command line entrypoint for local testing.
"""
result = crawl.remote(url)
print(result)
```
## Step 3: Understanding the Code Components
Let's break down what's happening in this code:
### 1. Image Definition
```python
image = modal.Image.debian_slim(python_version="3.10").pip_install(
["fastapi[standard]"]
).run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-get install -y git",
"apt-add-repository non-free",
"apt-add-repository contrib",
"pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
"pip install -U fastapi[standard]",
"pip install -U pydantic",
"crawl4ai-setup", # This installs playwright and downloads chromium
)
```
This section defines the container image that Modal will use to run your code. It:
- Starts with a Debian Slim base image with Python 3.10
- Installs FastAPI
- Updates the system packages
- Installs Git and other dependencies
- Installs Crawl4ai from the GitHub repository
- Runs the Crawl4ai setup to install Playwright and download Chromium
### 2. Modal App Definition
```python
app = modal.App("crawl4ai", image=image)
```
This creates a Modal application named "crawl4ai" that uses the image we defined above.
### 3. Default Configurations
```python
DEFAULT_BROWSER_CONFIG = {
"headless": True,
"verbose": False,
}
DEFAULT_CRAWLER_CONFIG = {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
```
These define the default configurations for the browser and crawler. You can customize these settings based on your specific needs.
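For instance, here is a minimal sketch of a customized configuration (the values are illustrative; the `threshold` of 0.6 matches the customization example later in this tutorial):

```python
# Show the browser window and log verbosely while debugging locally
custom_browser_config = {
    "headless": False,
    "verbose": True,
}

# Same structure as DEFAULT_CRAWLER_CONFIG, but with a stricter content filter
custom_crawler_config = {
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {
                            "threshold": 0.6,  # prune more aggressively than the 0.48 default
                            "threshold_type": "fixed"
                        }
                    }
                }
            }
        }
    }
}
```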
### 4. The Crawl Function
```python
@app.function(timeout=300)
async def crawl(url, browser_config, crawler_config):
# Function implementation
```
This is the main function that performs the crawling. It:
- Takes a URL and optional configurations
- Sets up the browser and crawler with those configurations
- Performs the crawl
- Returns the results in a serializable format
The `@app.function(timeout=300)` decorator tells Modal to run this function in the cloud with a 5-minute timeout.
### 5. The Web Endpoint
```python
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
# Function implementation
```
This creates a web endpoint that accepts POST requests. It:
- Extracts the URL and configurations from the request
- Calls the crawl function with those parameters
- Returns the results
### 6. Local Entrypoint
```python
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
# Function implementation
```
This provides a way to test the application from the command line.
## Step 4: Testing Locally
Before deploying, let's test our application locally:
```bash
modal run crawl4ai_modal.py --url "https://example.com"
```
This command will:
1. Upload your code to Modal
2. Create the necessary containers
3. Run the `main` function with the specified URL
4. Return the results
Modal will handle all the infrastructure setup for you. You should see the crawling results printed to your console.
## Step 5: Deploying Your Application
Once you're satisfied with the local testing, it's time to deploy:
```bash
modal deploy crawl4ai_modal.py
```
This will deploy your application to Modal's cloud. The deployment process will output URLs for your web endpoints.
You should see output similar to:
```
✓ Deployed crawl4ai.
URLs:
crawl_endpoint => https://your-username--crawl-endpoint.modal.run
```
Save this URL - you'll need it to make requests to your deployment.
## Step 6: Using Your Deployment
Now that your application is deployed, you can use it by sending POST requests to the endpoint URL:
```bash
curl -X POST https://your-username--crawl-endpoint.modal.run \
-H "Content-Type: application/json" \
-d '{"url": "https://example.com"}'
```
Or in Python:
```python
import requests
response = requests.post(
"https://your-username--crawl-endpoint.modal.run",
json={"url": "https://example.com"}
)
result = response.json()
print(result)
```
You can also customize the browser and crawler configurations:
```python
requests.post(
"https://your-username--crawl-endpoint.modal.run",
json={
"url": "https://example.com",
"browser_config": {
"headless": False,
"verbose": True
},
"crawler_config": {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.6, # Adjusted threshold
"threshold_type": "fixed"
}
}
}
}
}
}
}
}
)
```
## Step 7: Calling Your Deployment from Another Python Script
You can also call your deployed function directly from another Python script:
```python
import modal
# Get a reference to the deployed function
crawl_function = modal.Function.from_name("crawl4ai", "crawl")
# Call the function
result = crawl_function.remote("https://example.com")
print(result)
```
## Understanding Modal's Execution Flow
To understand how Modal works, it's important to know:
1. **Local vs. Remote Execution**: When you call a function with `.remote()`, it runs in Modal's cloud, not on your local machine (see the sketch after this list).
2. **Container Lifecycle**: Modal creates containers on-demand and destroys them when they're not needed.
3. **Caching**: Modal caches your container images to speed up subsequent runs.
4. **Serverless Scaling**: Modal automatically scales your application based on demand.
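To make the first point concrete, here's a minimal sketch (assuming the `app` and `crawl` function defined earlier in this tutorial):

```python
@app.local_entrypoint()
def demo(url: str = "https://example.com"):
    # .remote() runs the function in Modal's cloud and blocks until the result returns
    result = crawl.remote(url)
    print(result.get("url"))

    # .spawn() also runs in the cloud, but returns a handle immediately
    # instead of blocking; useful for fire-and-forget calls
    call = crawl.spawn(url)
    print(f"Spawned call id: {call.object_id}")
```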
## Customizing Your Deployment
You can customize your deployment in several ways:
### Changing the Crawl4ai Version
To use a different version of Crawl4ai, update the installation command in the image definition:
```python
"pip install -U git+https://github.com/unclecode/crawl4ai.git@main", # Use main branch
```
### Adjusting Resource Limits
You can change the resources allocated to your functions:
```python
@app.function(timeout=600, cpu=2, memory=4096) # 10 minute timeout, 2 CPUs, 4GB RAM
async def crawl(...):
# Function implementation
```
### Keeping Containers Warm
To reduce cold start times, you can keep containers warm:
```python
@app.function(keep_warm=1) # Keep 1 container warm
async def crawl(...):
# Function implementation
```
## Conclusion
That's it! You've successfully deployed Crawl4ai on Modal. You now have a scalable web crawling solution that can handle as many requests as you need without requiring any infrastructure management.
The beauty of this setup is its simplicity - Modal handles all the hard parts, letting you focus on using Crawl4ai to extract the data you need.
Feel free to reach out if you have any questions or need help with your deployment!
Happy crawling!
- UncleCode
## Additional Resources
- [Modal Documentation](https://modal.com/docs)
- [Crawl4ai GitHub Repository](https://github.com/unclecode/crawl4ai)
- [Crawl4ai Documentation](https://docs.crawl4ai.com)

deploy/modal/test_modal.py Normal file (+317 lines)

@@ -0,0 +1,317 @@
#!/usr/bin/env python3
"""
Crawl4ai API Testing Script
This script tests all endpoints of the Crawl4ai API service and demonstrates their usage.
"""
import argparse
import json
import sys
import time
from typing import Dict, Any, List, Optional
import requests
# Colors for terminal output
class Colors:
HEADER = '\033[95m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
def print_header(text: str) -> None:
"""Print a formatted header."""
print(f"\n{Colors.HEADER}{Colors.BOLD}{'=' * 80}{Colors.ENDC}")
print(f"{Colors.HEADER}{Colors.BOLD}{text.center(80)}{Colors.ENDC}")
print(f"{Colors.HEADER}{Colors.BOLD}{'=' * 80}{Colors.ENDC}\n")
def print_step(text: str) -> None:
"""Print a formatted step description."""
print(f"{Colors.BLUE}{Colors.BOLD}>> {text}{Colors.ENDC}")
def print_success(text: str) -> None:
"""Print a success message."""
print(f"{Colors.GREEN}{text}{Colors.ENDC}")
def print_warning(text: str) -> None:
"""Print a warning message."""
print(f"{Colors.YELLOW}{text}{Colors.ENDC}")
def print_error(text: str) -> None:
"""Print an error message."""
print(f"{Colors.RED}{text}{Colors.ENDC}")
def print_json(data: Dict[str, Any]) -> None:
"""Pretty print JSON data."""
print(json.dumps(data, indent=2))
def make_request(method: str, url: str, params: Optional[Dict[str, Any]] = None,
json_data: Optional[Dict[str, Any]] = None,
expected_status: int = 200) -> Dict[str, Any]:
"""Make an HTTP request and handle errors."""
print_step(f"Making {method.upper()} request to {url}")
if params:
print(f" Parameters: {params}")
if json_data:
print(f" JSON Data: {json_data}")
try:
response = requests.request(
method=method,
url=url,
params=params,
json=json_data,
timeout=300 # 5 minute timeout for crawling operations
)
status_code = response.status_code
print(f" Status Code: {status_code}")
try:
data = response.json()
print(" Response:")
print_json(data)
if status_code != expected_status:
print_error(f"Expected status code {expected_status}, got {status_code}")
return data
print_success("Request successful")
return data
except ValueError:
print_error("Response is not valid JSON")
print(response.text)
return {"error": "Invalid JSON response"}
except requests.RequestException as e:
print_error(f"Request failed: {str(e)}")
return {"error": str(e)}
def test_health_check(base_url: str) -> bool:
"""Test the health check endpoint."""
print_header("Testing Health Check Endpoint")
# NOTE: this assumes every endpoint is reachable under one shared base URL;
# with a plain Modal deployment each web_endpoint gets its own *.modal.run URL
response = make_request("GET", f"{base_url}/health_check")
if "status" in response and response["status"] == "online":
print_success("Health check passed")
return True
else:
print_error("Health check failed")
return False
def test_admin_create_user(base_url: str, admin_token: str, email: str, name: str) -> Optional[str]:
"""Test creating a new user."""
print_header("Testing Admin User Creation")
response = make_request(
"POST",
f"{base_url}/admin_create_user",
json_data={
"admin_token": admin_token,
"email": email,
"name": name
},
expected_status=201
)
if response.get("success") and "data" in response:
api_token = response["data"].get("api_token")
if api_token:
print_success(f"User created successfully with API token: {api_token}")
return api_token
print_error("Failed to create user")
return None
def test_check_credits(base_url: str, api_token: str) -> Optional[int]:
"""Test checking user credits."""
print_header("Testing Check Credits Endpoint")
response = make_request(
"GET",
f"{base_url}/check_credits",
params={"api_token": api_token}
)
if response.get("success") and "data" in response:
credits = response["data"].get("credits")
if credits is not None:
print_success(f"User has {credits} credits")
return credits
print_error("Failed to check credits")
return None
def test_crawl_endpoint(base_url: str, api_token: str, url: str) -> bool:
"""Test the crawl endpoint."""
print_header("Testing Crawl Endpoint")
response = make_request(
"POST",
f"{base_url}/crawl_endpoint",
json_data={
"api_token": api_token,
"url": url
}
)
if response.get("success") and "data" in response:
print_success("Crawl completed successfully")
# Display some crawl result data
data = response["data"]
if "title" in data:
print(f"Page Title: {data['title']}")
if "status" in data:
print(f"Status: {data['status']}")
if "links" in data:
print(f"Links found: {len(data['links'])}")
if "markdown_v2" in data and data["markdown_v2"] and "raw_markdown" in data["markdown_v2"]:
print("Markdown Preview (first 200 chars):")
print(data["markdown_v2"]["raw_markdown"][:200] + "...")
credits_remaining = response.get("credits_remaining")
if credits_remaining is not None:
print(f"Credits remaining: {credits_remaining}")
return True
print_error("Crawl failed")
return False
def test_admin_update_credits(base_url: str, admin_token: str, api_token: str, amount: int) -> bool:
"""Test updating user credits."""
print_header("Testing Admin Update Credits")
response = make_request(
"POST",
f"{base_url}/admin_update_credits",
json_data={
"admin_token": admin_token,
"api_token": api_token,
"amount": amount
}
)
if response.get("success") and "data" in response:
print_success(f"Credits updated successfully, new balance: {response['data'].get('credits')}")
return True
print_error("Failed to update credits")
return False
def test_admin_get_users(base_url: str, admin_token: str) -> List[Dict[str, Any]]:
"""Test getting all users."""
print_header("Testing Admin Get All Users")
response = make_request(
"GET",
f"{base_url}/admin_get_users",
params={"admin_token": admin_token}
)
if response.get("success") and "data" in response:
users = response["data"]
print_success(f"Retrieved {len(users)} users")
return users
print_error("Failed to get users")
return []
def run_full_test(base_url: str, admin_token: str) -> None:
"""Run all tests in sequence."""
# Remove trailing slash if present
base_url = base_url.rstrip('/')
# Test 1: Health Check
if not test_health_check(base_url):
print_error("Health check failed, aborting tests")
sys.exit(1)
# Test 2: Create a test user
email = f"test-user-{int(time.time())}@example.com"
name = "Test User"
api_token = test_admin_create_user(base_url, admin_token, email, name)
if not api_token:
print_error("User creation failed, aborting tests")
sys.exit(1)
# Test 3: Check initial credits
initial_credits = test_check_credits(base_url, api_token)
if initial_credits is None:
print_error("Credit check failed, aborting tests")
sys.exit(1)
# Test 4: Perform a crawl
test_url = "https://news.ycombinator.com"
crawl_success = test_crawl_endpoint(base_url, api_token, test_url)
if not crawl_success:
print_warning("Crawl test failed, but continuing with other tests")
# Test 5: Check credits after crawl
post_crawl_credits = test_check_credits(base_url, api_token)
if post_crawl_credits is not None and initial_credits is not None:
if post_crawl_credits == initial_credits - 1:
print_success("Credit deduction verified")
else:
print_warning(f"Unexpected credit change: {initial_credits} -> {post_crawl_credits}")
# Test 6: Add credits
add_credits_amount = 50
post_addition_credits = post_crawl_credits  # predefine so the final summary can't hit a NameError
if test_admin_update_credits(base_url, admin_token, api_token, add_credits_amount):
print_success(f"Added {add_credits_amount} credits")
# Test 7: Check credits after addition
post_addition_credits = test_check_credits(base_url, api_token)
if post_addition_credits is not None and post_crawl_credits is not None:
if post_addition_credits == post_crawl_credits + add_credits_amount:
print_success("Credit addition verified")
else:
print_warning(f"Unexpected credit change: {post_crawl_credits} -> {post_addition_credits}")
# Test 8: Get all users
users = test_admin_get_users(base_url, admin_token)
if users:
# Check if our test user is in the list
test_user = next((user for user in users if user.get("email") == email), None)
if test_user:
print_success("Test user found in users list")
else:
print_warning("Test user not found in users list")
# Final report
print_header("Test Summary")
print_success("All endpoints tested successfully")
print(f"Test user created with email: {email}")
print(f"API token: {api_token}")
print(f"Final credit balance: {post_addition_credits}")
def main():
parser = argparse.ArgumentParser(description="Test Crawl4ai API endpoints")
parser.add_argument("--base-url", required=True, help="Base URL of the Crawl4ai API (e.g., https://username--crawl4ai-api.modal.run)")
parser.add_argument("--admin-token", required=True, help="Admin token for authentication")
args = parser.parse_args()
print_header("Crawl4ai API Test Script")
print(f"Testing API at: {args.base_url}")
run_full_test(args.base_url, args.admin_token)
if __name__ == "__main__":
main()