feat(api): add API token authentication and update Dockerfile description

This commit is contained in:
UncleCode
2024-11-16 18:08:56 +08:00
parent 1961adb530
commit 6360d0545a
4 changed files with 65 additions and 9 deletions

View File

@@ -12,7 +12,7 @@ ARG ENABLE_GPU=false
# Platform-specific labels
LABEL maintainer="unclecode"
LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
# Environment setup
@@ -79,6 +79,7 @@ COPY . .
RUN pip install --no-cache-dir -r requirements.txt
# Install required library for FastAPI
RUN pip install .
RUN pip install fastapi uvicorn psutil
# Install ML dependencies first for better layer caching

33
docker-compose.yml Normal file
View File

@@ -0,0 +1,33 @@
version: '3.8'
services:
crawl4ai:
build:
context: .
dockerfile: Dockerfile
args:
PYTHON_VERSION: 3.10
INSTALL_TYPE: all
ENABLE_GPU: false
ports:
- "11235:11235" # FastAPI server
- "8000:8000" # Alternative port
- "9222:9222" # Browser debugging
- "8080:8080" # Additional port
environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token
volumes:
- /dev/shm:/dev/shm # Shared memory for browser operations
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s

View File

@@ -7,12 +7,14 @@ import os
from typing import Dict, Any
class Crawl4AiTester:
def __init__(self, base_url: str = "http://localhost:11235"):
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
self.base_url = base_url
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
# Submit crawl job
response = requests.post(f"{self.base_url}/crawl", json=request_data)
response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
task_id = response.json()["task_id"]
print(f"Task ID: {task_id}")
@@ -22,7 +24,7 @@ class Crawl4AiTester:
if time.time() - start_time > timeout:
raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
result = requests.get(f"{self.base_url}/task/{task_id}")
result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
status = result.json()
if status["status"] == "failed":
@@ -35,14 +37,17 @@ class Crawl4AiTester:
time.sleep(2)
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60)
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
if response.status_code == 408:
raise TimeoutError("Task did not complete within server timeout")
response.raise_for_status()
return response.json()
def test_docker_deployment(version="basic"):
tester = Crawl4AiTester()
tester = Crawl4AiTester(
# base_url="http://localhost:11235"
base_url="https://crawl4ai-sby74.ondigitalocean.app"
)
print(f"Testing Crawl4AI Docker {version} version")
# Health check with timeout and retry

23
main.py
View File

@@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import FileResponse
from fastapi.responses import RedirectResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi import Depends, Security
from pydantic import BaseModel, HttpUrl, Field
from typing import Optional, List, Dict, Any, Union
@@ -322,6 +324,21 @@ app.add_middleware(
# Mount the pages directory as a static directory
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
# API token security
security = HTTPBearer()
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")
async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
if not CRAWL4AI_API_TOKEN:
return credentials # No token verification if CRAWL4AI_API_TOKEN is not set
if credentials.credentials != CRAWL4AI_API_TOKEN:
raise HTTPException(status_code=401, detail="Invalid token")
return credentials
# Helper function to conditionally apply security
def secure_endpoint():
return Depends(verify_token) if CRAWL4AI_API_TOKEN else None
# Check if site directory exists
if os.path.exists(__location__ + "/site"):
# Mount the site directory as a static directory
@@ -348,12 +365,12 @@ def read_root():
return {"message": "Crawl4AI API service is running"}
@app.post("/crawl")
@app.post("/crawl", dependencies=[Depends(verify_token)])
async def crawl(request: CrawlRequest) -> Dict[str, str]:
task_id = await crawler_service.submit_task(request)
return {"task_id": task_id}
@app.get("/task/{task_id}")
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
async def get_task_status(task_id: str):
task_info = crawler_service.task_manager.get_task(task_id)
if not task_info:
@@ -375,7 +392,7 @@ async def get_task_status(task_id: str):
return response
@app.post("/crawl_sync")
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
task_id = await crawler_service.submit_task(request)