From 6360d0545ac2812687a1a9a31de95fa64f600ed4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:08:56 +0800 Subject: [PATCH] feat(api): add API token authentication and update Dockerfile description --- Dockerfile | 3 ++- docker-compose.yml | 33 +++++++++++++++++++++++++++++++++ docs/examples/docker_example.py | 15 ++++++++++----- main.py | 23 ++++++++++++++++++++--- 4 files changed, 65 insertions(+), 9 deletions(-) create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile index ba29faf1..76b4e1cf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ ARG ENABLE_GPU=false # Platform-specific labels LABEL maintainer="unclecode" -LABEL description="Crawl4AI - Advanced Web Crawler with AI capabilities" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL version="1.0" # Environment setup @@ -79,6 +79,7 @@ COPY . . RUN pip install --no-cache-dir -r requirements.txt # Install required library for FastAPI +RUN pip install . RUN pip install fastapi uvicorn psutil # Install ML dependencies first for better layer caching diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..ef0dc9e4 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: '3.8' + +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 502f1e52..6701f6ac 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -7,12 +7,14 @@ import os from typing import Dict, Any class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): + def __init__(self, base_url: str = "http://localhost:11235", api_token: str = None): self.base_url = base_url + self.api_token = api_token or os.getenv('CRAWL4AI_API_TOKEN') # Check environment variable as fallback + self.headers = {'Authorization': f'Bearer {self.api_token}'} if self.api_token else {} def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) + response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -22,7 +24,7 @@ class Crawl4AiTester: if time.time() - start_time > timeout: raise TimeoutError(f"Task {task_id} did not complete within {timeout} seconds") - result = requests.get(f"{self.base_url}/task/{task_id}") + result = requests.get(f"{self.base_url}/task/{task_id}", headers=self.headers) status = result.json() if status["status"] == "failed": @@ -35,14 +37,17 @@ class Crawl4AiTester: time.sleep(2) def submit_sync(self, request_data: Dict[str, Any]) -> Dict[str, Any]: - response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, timeout=60) + response = requests.post(f"{self.base_url}/crawl_sync", json=request_data, headers=self.headers, timeout=60) if response.status_code == 408: raise TimeoutError("Task did not complete within server timeout") response.raise_for_status() return response.json() def test_docker_deployment(version="basic"): - tester = Crawl4AiTester() + tester = Crawl4AiTester( + # base_url="http://localhost:11235" + base_url="https://crawl4ai-sby74.ondigitalocean.app" + ) print(f"Testing Crawl4AI Docker {version} version") # Health check with timeout and retry diff --git a/main.py b/main.py index 660c3366..92b1793b 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ from fastapi.exceptions import RequestValidationError from starlette.middleware.base import BaseHTTPMiddleware from starlette.responses import FileResponse from fastapi.responses import RedirectResponse +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from fastapi import Depends, Security from pydantic import BaseModel, HttpUrl, Field from typing import Optional, List, Dict, Any, Union @@ -322,6 +324,21 @@ app.add_middleware( # Mount the pages directory as a static directory app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") +# API token security +security = HTTPBearer() +CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") + +async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)): + if not CRAWL4AI_API_TOKEN: + return credentials # No token verification if CRAWL4AI_API_TOKEN is not set + if credentials.credentials != CRAWL4AI_API_TOKEN: + raise HTTPException(status_code=401, detail="Invalid token") + return credentials + +# Helper function to conditionally apply security +def secure_endpoint(): + return Depends(verify_token) if CRAWL4AI_API_TOKEN else None + # Check if site directory exists if os.path.exists(__location__ + "/site"): # Mount the site directory as a static directory @@ -348,12 +365,12 @@ def read_root(): return {"message": "Crawl4AI API service is running"} -@app.post("/crawl") +@app.post("/crawl", dependencies=[Depends(verify_token)]) async def crawl(request: CrawlRequest) -> Dict[str, str]: task_id = await crawler_service.submit_task(request) return {"task_id": task_id} -@app.get("/task/{task_id}") +@app.get("/task/{task_id}", dependencies=[Depends(verify_token)]) async def get_task_status(task_id: str): task_info = crawler_service.task_manager.get_task(task_id) if not task_info: @@ -375,7 +392,7 @@ async def get_task_status(task_id: str): return response -@app.post("/crawl_sync") +@app.post("/crawl_sync", dependencies=[Depends(verify_token)]) async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]: task_id = await crawler_service.submit_task(request)