feat(api): add API token authentication and update Dockerfile description
This commit is contained in:
@@ -12,7 +12,7 @@ ARG ENABLE_GPU=false
|
||||
|
||||
# Platform-specific labels
|
||||
LABEL maintainer="unclecode"
|
||||
LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities"
|
||||
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
LABEL version="1.0"
|
||||
|
||||
# Environment setup
|
||||
@@ -79,6 +79,7 @@ COPY . .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install required library for FastAPI
|
||||
RUN pip install .
|
||||
RUN pip install fastapi uvicorn psutil
|
||||
|
||||
# Install ML dependencies first for better layer caching
|
||||
|
||||
33
docker-compose.yml
Normal file
33
docker-compose.yml
Normal file
@@ -0,0 +1,33 @@
|
||||
# Compose definition for running the Crawl4AI API server in a container.
version: '3.8'

services:
  crawl4ai:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Build args are quoted so YAML keeps them as strings: an unquoted
        # 3.10 is read as the float 3.1, and an unquoted false as a boolean.
        PYTHON_VERSION: "3.10"
        INSTALL_TYPE: all
        ENABLE_GPU: "false"
    ports:
      - "11235:11235"  # FastAPI server
      - "8000:8000"    # Alternative port
      - "9222:9222"    # Browser debugging
      - "8080:8080"    # Additional port
    environment:
      # Passed through from the host environment; defaults to empty when unset.
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
    volumes:
      - /dev/shm:/dev/shm  # Shared memory for browser operations
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 1G
    restart: unless-stopped
    healthcheck:
      # NOTE(review): requires curl to be present inside the image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
|
||||
@@ -7,12 +7,14 @@ import os
|
||||
from typing import Dict, Any
|
||||
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235"):
|
||||
def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None):
|
||||
self.base_url = base_url
|
||||
self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback
|
||||
self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {}
|
||||
|
||||
def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]:
|
||||
# Submit crawl job
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data)
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers)
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Task ID: {task_id}")
|
||||
|
||||
@@ -22,7 +24,7 @@ class Crawl4AiTester:
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds")
|
||||
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}")
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers)
|
||||
status = result.json()
|
||||
|
||||
if status["status"] == "failed":
|
||||
@@ -35,14 +37,17 @@ class Crawl4AiTester:
|
||||
time.sleep(2)
|
||||
|
||||
def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60)
|
||||
response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60)
|
||||
if response.status_code == 408:
|
||||
raise TimeoutError("Task did not complete within server timeout")
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def test_docker_deployment(version="basic"):
|
||||
tester = Crawl4AiTester()
|
||||
tester = Crawl4AiTester(
|
||||
# base_url="http://localhost:11235"
|
||||
base_url="https://crawl4ai-sby74.ondigitalocean.app"
|
||||
)
|
||||
print(f"Testing Crawl4AI Docker {version} version")
|
||||
|
||||
# Health check with timeout and retry
|
||||
|
||||
23
main.py
23
main.py
@@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.responses import FileResponse
|
||||
from fastapi.responses import RedirectResponse
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from fastapi import Depends, Security
|
||||
|
||||
from pydantic import BaseModel, HttpUrl, Field
|
||||
from typing import Optional, List, Dict, Any, Union
|
||||
@@ -322,6 +324,21 @@ app.add_middleware(
|
||||
# Mount the pages directory as a static directory
|
||||
app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages")
|
||||
|
||||
# API token security.
# auto_error=False makes HTTPBearer hand None to verify_token when the
# Authorization header is missing instead of rejecting the request itself,
# so header-less requests are accepted when no server token is configured.
security = HTTPBearer(auto_error=False)
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")


async def verify_token(
    credentials: Optional[HTTPAuthorizationCredentials] = Security(security),
):
    """Validate the bearer token of an incoming request.

    When CRAWL4AI_API_TOKEN is not set, no verification is performed and
    the credentials (possibly None) are returned unchanged.

    Args:
        credentials: Bearer credentials parsed from the Authorization
            header, or None when the header is absent.

    Returns:
        The credentials that were passed in.

    Raises:
        HTTPException: 401 when a server token is configured and the
            request carries no bearer credentials or a non-matching token.
    """
    import secrets  # local import keeps this block self-contained

    if not CRAWL4AI_API_TOKEN:
        return credentials  # No token verification if CRAWL4AI_API_TOKEN is not set

    if credentials is None:
        raise HTTPException(
            status_code=401,
            detail="Not authenticated",
            headers={"WWW-Authenticate": "Bearer"},
        )

    # Compare as bytes in constant time: compare_digest rejects non-ASCII
    # str arguments, and a plain != can leak how much of the token matched.
    supplied = credentials.credentials.encode("utf-8")
    expected = CRAWL4AI_API_TOKEN.encode("utf-8")
    if not secrets.compare_digest(supplied, expected):
        raise HTTPException(
            status_code=401,
            detail="Invalid token",
            headers={"WWW-Authenticate": "Bearer"},
        )
    return credentials
|
||||
|
||||
# Helper function to conditionally apply security
|
||||
def secure_endpoint():
    """Return a ``Depends(verify_token)`` dependency, or None.

    The dependency is returned only when CRAWL4AI_API_TOKEN is set;
    otherwise the result is None.
    """
    if not CRAWL4AI_API_TOKEN:
        return None
    return Depends(verify_token)
|
||||
|
||||
# Check if site directory exists
|
||||
if os.path.exists(__location__ + "/site"):
|
||||
# Mount the site directory as a static directory
|
||||
@@ -348,12 +365,12 @@ def read_root():
|
||||
return {"message": "Crawl4AI API service is running"}
|
||||
|
||||
|
||||
@app.post("/crawl")
|
||||
@app.post("/crawl", dependencies=[Depends(verify_token)])
|
||||
async def crawl(request: CrawlRequest) -> Dict[str, str]:
|
||||
task_id = await crawler_service.submit_task(request)
|
||||
return {"task_id": task_id}
|
||||
|
||||
@app.get("/task/{task_id}")
|
||||
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
|
||||
async def get_task_status(task_id: str):
|
||||
task_info = crawler_service.task_manager.get_task(task_id)
|
||||
if not task_info:
|
||||
@@ -375,7 +392,7 @@ async def get_task_status(task_id: str):
|
||||
|
||||
return response
|
||||
|
||||
@app.post("/crawl_sync")
|
||||
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
|
||||
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
|
||||
task_id = await crawler_service.submit_task(request)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user