From 4b45b28f256ad62272d5ea75ae898de7882618ba Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 16 Nov 2024 18:44:47 +0800 Subject: [PATCH] feat(docs): enhance deployment documentation with one-click setup, API security details, and Docker Compose examples --- README.md | 15 ++ docs/examples/docker_example.py | 13 +- docs/md_v2/basic/docker-deploymeny.md | 230 ++++++++++++++++++++------ main.py | 3 + 4 files changed, 207 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index d250f936..a2806304 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,21 @@ cd crawl4ai pip install -e . ``` +## One-Click Deployment 🚀 + +Deploy your own instance of Crawl4AI with one click: + +[![DigitalOcean Referral Badge](https://web-platforms.sfo2.cdn.digitaloceanspaces.com/WWW/Badge%203.svg)](https://www.digitalocean.com/?repo=https://github.com/unclecode/crawl4ai/tree/0.3.74&refcode=a0780f1bdb3d&utm_campaign=Referral_Invite&utm_medium=Referral_Program&utm_source=badge) + + +> 💡 **Recommended specs**: 4GB RAM minimum. Select "professional-xs" or higher when deploying for stable operation. + +The deploy will: +- Set up a Docker container with Crawl4AI +- Configure Playwright and all dependencies +- Start the FastAPI server on port 11235 +- Set up health checks and auto-deployment + ### Using Docker 🐳 Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub (recommended) or build from the repository. 
diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 6701f6ac..b43e8ee6 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -15,6 +15,8 @@ class Crawl4AiTester: def submit_and_wait(self, request_data: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: # Submit crawl job response = requests.post(f"{self.base_url}/crawl", json=request_data, headers=self.headers) + if response.status_code == 403: + raise Exception("API token is invalid or missing") task_id = response.json()["task_id"] print(f"Task ID: {task_id}") @@ -45,8 +47,9 @@ class Crawl4AiTester: def test_docker_deployment(version="basic"): tester = Crawl4AiTester( - # base_url="http://localhost:11235" - base_url="https://crawl4ai-sby74.ondigitalocean.app" + base_url="http://localhost:11235" , + # base_url="https://crawl4ai-sby74.ondigitalocean.app", + api_token="test" ) print(f"Testing Crawl4AI Docker {version} version") @@ -83,7 +86,8 @@ def test_basic_crawl(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_and_wait(request) @@ -95,7 +99,8 @@ def test_basic_crawl_sync(tester: Crawl4AiTester): print("\n=== Testing Basic Crawl (Sync) ===") request = { "urls": "https://www.nbcnews.com/business", - "priority": 10 + "priority": 10, + "session_id": "test" } result = tester.submit_sync(request) diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index a500ee21..30555708 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -1,12 +1,115 @@ -# Docker Deployment 🐳 +# Docker Deployment Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. 
-## Docker Compose Setup 🐳 +## Quick Start 🚀 -### Basic Usage +Pull and run the basic version: + +```bash +# Basic run without security +docker pull unclecode/crawl4ai:basic +docker run -p 11235:11235 unclecode/crawl4ai:basic + +# Run with API security enabled +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +``` + +## API Security 🔒 + +### Understanding CRAWL4AI_API_TOKEN + +The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: + +- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication +- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible + +```bash +# Secured Instance +docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all + +# Unsecured Instance +docker run -p 11235:11235 unclecode/crawl4ai:all +``` + +### Making API Calls + +For secured instances, include the token in all requests: + +```python +import requests + +# Setup headers if token is being used +api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN +headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} + +# Making authenticated requests +response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://example.com", + "priority": 10 + } +) + +# Checking task status +task_id = response.json()["task_id"] +status = requests.get( + f"http://localhost:11235/task/{task_id}", + headers=headers +) +``` + +### Using with Docker Compose + +In your `docker-compose.yml`: +```yaml +services: + crawl4ai: + image: unclecode/crawl4ai:all + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional + # ... other configuration +``` + +Then either: +1. Set in `.env` file: +```env +CRAWL4AI_API_TOKEN=your_secret_token +``` + +2. 
Or set via command line: +```bash +CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +``` + +> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). + +## Configuration Options 🔧 + +### Environment Variables + +You can configure the service using environment variables: + +```bash +# Basic configuration +docker run -p 11235:11235 \ + -e MAX_CONCURRENT_TASKS=5 \ + unclecode/crawl4ai:all + +# With security and LLM support +docker run -p 11235:11235 \ + -e CRAWL4AI_API_TOKEN=your_secret_token \ + -e OPENAI_API_KEY=sk-... \ + -e ANTHROPIC_API_KEY=sk-ant-... \ + unclecode/crawl4ai:all +``` + +### Using Docker Compose (Recommended) 🐳 Create a `docker-compose.yml`: + ```yaml version: '3.8' @@ -15,83 +118,110 @@ services: image: unclecode/crawl4ai:all ports: - "11235:11235" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security + - MAX_CONCURRENT_TASKS=5 + # LLM Provider Keys + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} volumes: - /dev/shm:/dev/shm deploy: resources: limits: memory: 4G - restart: unless-stopped + reservations: + memory: 1G ``` -Run with: +You can run it in two ways: + +1. Using environment variables directly: ```bash -docker-compose up -d +CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up ``` -### Secure Mode with API Token - -To enable API authentication, simply set the `CRAWL4AI_API_TOKEN`: -```bash -CRAWL4AI_API_TOKEN=your-secret-token docker-compose up -d -``` - -### Using Environment Variables - -Create a `.env` file for your API tokens: +2. 
Using a `.env` file (recommended): +Create a `.env` file in the same directory: ```env -# Crawl4AI API Security (optional) -CRAWL4AI_API_TOKEN=your-secret-token +# API Security (optional) +CRAWL4AI_API_TOKEN=your_secret_token -# LLM Provider API Keys +# LLM Provider Keys OPENAI_API_KEY=sk-... ANTHROPIC_API_KEY=sk-ant-... -GOOGLE_API_KEY=... -GEMINI_API_KEY=... -OLLAMA_API_KEY=... -# Additional Configuration +# Other Configuration MAX_CONCURRENT_TASKS=5 ``` -Docker Compose will automatically load variables from the `.env` file. No additional configuration needed! +Then simply run: +```bash +docker-compose up +``` -### Testing with API Token +### Testing the Deployment 🧪 ```python import requests -# Initialize headers with token if using secure mode -headers = {} -if api_token := os.getenv('CRAWL4AI_API_TOKEN'): - headers['Authorization'] = f'Bearer {api_token}' +# For unsecured instances +def test_unsecured(): + # Health check + health = requests.get("http://localhost:11235/health") + print("Health check:", health.json()) -# Test crawl with authentication -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 + # Basic crawl + response = requests.post( + "http://localhost:11235/crawl", + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) + +# For secured instances +def test_secured(api_token): + headers = {"Authorization": f"Bearer {api_token}"} + + # Basic crawl with authentication + response = requests.post( + "http://localhost:11235/crawl", + headers=headers, + json={ + "urls": "https://www.nbcnews.com/business", + "priority": 10 + } + ) + task_id = response.json()["task_id"] + print("Task ID:", task_id) +``` + +### LLM Extraction Example 🤖 + +When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction: + +```python 
+request = { + "urls": "https://example.com", + "extraction_config": { + "type": "llm", + "params": { + "provider": "openai/gpt-4", + "instruction": "Extract main topics from the page" + } } -) -task_id = response.json()["task_id"] +} + +# Make the request (add headers if using API security) +response = requests.post("http://localhost:11235/crawl", json=request) ``` -### Security Best Practices 🔒 +> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! -- Add `.env` to your `.gitignore` -- Use different API tokens for development and production -- Rotate API tokens periodically -- Use secure methods to pass tokens in production environments -``` -This addition to your documentation: -1. Shows how to use Docker Compose -2. Explains both secure and non-secure modes -3. Demonstrates environment variable configuration -4. Provides example code for authenticated requests -5. Includes security best practices diff --git a/main.py b/main.py index 92b1793b..41788d61 100644 --- a/main.py +++ b/main.py @@ -65,6 +65,7 @@ class CrawlRequest(BaseModel): screenshot: bool = False magic: bool = False extra: Optional[Dict[str, Any]] = {} + session_id: Optional[str] = None @dataclass class TaskInfo: @@ -284,6 +285,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, ) else: @@ -295,6 +297,7 @@ class CrawlerService: css_selector=request.css_selector, screenshot=request.screenshot, magic=request.magic, + session_id=request.session_id, **request.extra, )