feat(docker): enhance Docker deployment setup and configuration

Add comprehensive Docker deployment configuration with:
- New .dockerignore and .llm.env.example files
- Enhanced Dockerfile with multi-stage build and optimizations
- Detailed README with setup instructions and environment configurations
- Improved requirements.txt with Gunicorn
- Better error handling in async_configs.py

BREAKING CHANGE: Docker deployment now requires .llm.env file for API keys
This commit is contained in:
UncleCode
2025-02-01 19:33:27 +08:00
parent 20920fa17b
commit 2f15976b34
10 changed files with 324 additions and 121 deletions

1
.gitignore vendored
View File

@@ -250,3 +250,4 @@ continue_config.json
.prompts/
.llm.env

View File

@@ -1,3 +1,3 @@
# crawl4ai/_version.py
# __version__ = "0.4.3b3"
__version__ = "0.4.300b4"
__version__ = "0.4.300"

View File

@@ -1,3 +1,4 @@
from regex import B
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -359,7 +360,7 @@ class BrowserConfig():
@staticmethod
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
return from_serializable_dict(data)
return from_serializable_dict(data) if data else BrowserConfig()
class CrawlerRunConfig():
@@ -794,7 +795,7 @@ class CrawlerRunConfig():
@staticmethod
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
return from_serializable_dict(data)
return from_serializable_dict(data) if data else CrawlerRunConfig()
def to_dict(self):
return {

View File

@@ -0,0 +1,31 @@
# .dockerignore
*
# Allow specific files and directories when using local installation
!crawl4ai/
!docs/
!deploy/docker/
!setup.py
!pyproject.toml
!README.md
!LICENSE
!MANIFEST.in
!setup.cfg
!mkdocs.yml
.git/
__pycache__/
*.pyc
*.pyo
*.pyd
.DS_Store
.env
.venv
venv/
tests/
coverage.xml
*.log
*.swp
*.egg-info/
dist/
build/

View File

@@ -0,0 +1,8 @@
# LLM Provider Keys
OPENAI_API_KEY=your_openai_key_here
DEEPSEEK_API_KEY=your_deepseek_key_here
ANTHROPIC_API_KEY=your_anthropic_key_here
GROQ_API_KEY=your_groq_key_here
TOGETHER_API_KEY=your_together_key_here
MISTRAL_API_KEY=your_mistral_key_here
GEMINI_API_TOKEN=your_gemini_key_here

View File

@@ -1,18 +1,174 @@
FROM python:3.10-slim
# Install system dependencies
RUN apt-get update && apt-get install -y \
# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
ARG GITHUB_BRANCH=main
ARG USE_LOCAL=false
# 🤓 Environment variables - because who doesn't love a good ENV party?
ENV PYTHONFAULTHANDLER=1 \
PYTHONHASHSEED=random \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_DEFAULT_TIMEOUT=100 \
DEBIAN_FRONTEND=noninteractive
# Other build arguments
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
ARG TARGETARCH
# 🎯 Platform-specific labels - because even containers need ID badges
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
# 📦 Installing system dependencies... please hold, your package is being delivered
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
wget \
gnupg \
git \
cmake \
pkg-config \
python3-dev \
libjpeg-dev \
&& rm -rf /var/lib/apt/lists/*
# Install Playwright dependencies
# 🎭 Playwright dependencies - because browsers need their vitamins too
RUN apt-get update && apt-get install -y --no-install-recommends \
libglib2.0-0 \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libdbus-1-3 \
libxcb1 \
libxkbcommon0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# 🎮 GPU support - because sometimes CPU just doesn't cut it
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
# 🏗️ Platform-specific optimizations - because one size doesn't fit all
RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \
&& rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \
&& rm -rf /var/lib/apt/lists/*; \
fi
WORKDIR ${APP_HOME}
# 🔄 Installation script - now with retry logic because sometimes Git needs a coffee break
RUN echo '#!/bin/bash\n\
if [ "$USE_LOCAL" = "true" ]; then\n\
echo "📦 Installing from local source..."\n\
pip install --no-cache-dir /tmp/project/\n\
else\n\
echo "🌐 Installing from GitHub..."\n\
for i in {1..3}; do \n\
git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
{ echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
done\n\
pip install --no-cache-dir /tmp/crawl4ai\n\
fi' > /tmp/install.sh && chmod +x /tmp/install.sh
# Copy local project if USE_LOCAL is true
COPY . /tmp/project/
# Copy and install other requirements
COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install ML dependencies first for better layer caching
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install --no-cache-dir \
torch \
torchvision \
torchaudio \
scikit-learn \
nltk \
transformers \
tokenizers && \
python -m nltk.downloader punkt stopwords ; \
fi
# Install the package
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install "/tmp/project/[all]" && \
python -m crawl4ai.model_loader ; \
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
pip install "/tmp/project/[torch]" ; \
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
pip install "/tmp/project/[transformer]" && \
python -m crawl4ai.model_loader ; \
else \
pip install "/tmp/project" ; \
fi
# 🚀 Installation validation - trust but verify!
RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy application files
COPY deploy/docker/* ${APP_HOME}/
COPY . .
# 🏥 Health check - now with memory validation!
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
if [ $MEM -lt 2048 ]; then \
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
exit 1; \
fi && \
curl -f http://localhost:8000/health || exit 1'
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
# Entrypoint script
COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["docker-entrypoint.sh"]
# Default command - may the server be with you! 🚀
CMD ["gunicorn", \
"--bind", "0.0.0.0:8000", \
"--workers", "4", \
"--threads", "2", \
"--timeout", "120", \
"--graceful-timeout", "30", \
"--log-level", "info", \
"--worker-class", "uvicorn.workers.UvicornWorker", \
"server:app"]

113
deploy/docker/README.md Normal file
View File

@@ -0,0 +1,113 @@
# Crawl4AI Docker Setup
## Quick Start
1. Build the Docker image:
```bash
docker build -t crawl4ai-server:prod .
```
2. Run the container:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
--name crawl4ai \
crawl4ai-server:prod
```
---
## Configuration Options
### 1. **Using .llm.env File**
Create a `.llm.env` file with your API keys:
```bash
OPENAI_API_KEY=sk-your-key
DEEPSEEK_API_KEY=your-deepseek-key
```
Run with:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
crawl4ai-server:prod
```
### 2. **Direct Environment Variables**
Pass keys directly:
```bash
docker run -d -p 8000:8000 \
-e OPENAI_API_KEY="sk-your-key" \
-e DEEPSEEK_API_KEY="your-deepseek-key" \
crawl4ai-server:prod
```
### 3. **Copy Host Environment Variables**
Pass the `--copy-env` flag to the container's entrypoint to copy `.llm.env` from the host. Note that it must come *after* the image name — anything before the image name is parsed by `docker run` itself, which would reject `--copy-env` as an unknown option:
```bash
docker run -d -p 8000:8000 \
  crawl4ai-server:prod \
  --copy-env
```
### 4. **Advanced: Docker Compose**
Create a `docker-compose.yml`:
```yaml
version: '3.8'
services:
crawl4ai:
image: crawl4ai-server:prod
ports:
- "8000:8000"
env_file:
- .llm.env
restart: unless-stopped
```
Run with:
```bash
docker-compose up -d
```
---
## Supported Environment Variables
| Variable | Description |
|------------------------|--------------------------------------|
| `OPENAI_API_KEY` | OpenAI API key |
| `DEEPSEEK_API_KEY` | DeepSeek API key |
| `ANTHROPIC_API_KEY` | Anthropic API key |
| `GROQ_API_KEY` | Groq API key |
| `TOGETHER_API_KEY` | Together API key |
| `LLAMA_CLOUD_API_KEY` | Llama Cloud API key |
| `COHERE_API_KEY` | Cohere API key |
| `MISTRAL_API_KEY` | Mistral API key |
| `PERPLEXITY_API_KEY` | Perplexity API key |
| `VERTEXAI_PROJECT_ID` | Google Vertex AI project ID |
| `VERTEXAI_LOCATION` | Google Vertex AI location |
---
## Healthcheck
The container includes a healthcheck:
```bash
curl http://localhost:8000/health
```
---
## Troubleshooting
1. **Missing Keys**: Ensure all required keys are set in `.llm.env`.
2. **Permissions**: Run `chmod +x docker-entrypoint.sh` if permissions are denied.
3. **Logs**: Check logs with:
```bash
docker logs crawl4ai
```
---
## Security Best Practices
- Never commit `.llm.env` to version control.
- Use Docker secrets in production (Swarm/K8s).
- Rotate keys regularly.

View File

@@ -1,3 +1,4 @@
crawl4ai
fastapi
uvicorn
uvicorn
gunicorn>=23.0.0

View File

@@ -1,108 +0,0 @@
import httpx
import asyncio
import json
async def test_regular():
    """Test the /crawl endpoint in non-streaming (batch) mode.

    Sends 3 identical URLs in one request and expects all results back in a
    single JSON response once the whole batch has been crawled.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            # Match the 30s timeout used by the other tests; httpx's 5s
            # default total timeout is too short for a real 3-URL crawl.
            timeout=30.0,
            json={
                "urls": ["https://example.com"] * 3,  # Test with 3 identical URLs
                "browser_config": {
                    "headless": True,
                    "verbose": False
                },
                "crawler_config": {
                    "cache_mode": "BYPASS",
                    "stream": False  # batch mode: one JSON body with all results
                }
            },
        )
        # Fail loudly on HTTP errors instead of a confusing KeyError below.
        response.raise_for_status()
        results = response.json()
        print("\nRegular Response:")
        print(f"Got {len(results['results'])} results at once")
        for result in results['results']:
            print(f"URL: {result['url']}, Success: {result['success']}")
async def test_streaming():
    """Test the /crawl endpoint in streaming (line-delimited JSON) mode.

    Uses ``client.stream()`` so each line is consumed as the server emits
    it. A plain ``client.post()`` reads and buffers the ENTIRE response
    body before returning, so iterating ``aiter_lines()`` on its result
    never streams incrementally — which defeats the purpose of this test.
    """
    async with httpx.AsyncClient() as client:
        try:
            async with client.stream(
                "POST",
                "http://localhost:8000/crawl",
                json={
                    "urls": ["https://example.com"] * 3,
                    "browser_config": {
                        "headless": True,
                        "verbose": False
                    },
                    "crawler_config": {
                        "cache_mode": "BYPASS",
                        "stream": True  # server emits one JSON object per line
                    }
                },
                timeout=30.0,
            ) as response:
                print("\nStreaming Response:")
                async for line in response.aiter_lines():
                    if not line.strip():
                        continue  # skip blank keep-alive lines
                    try:
                        result = json.loads(line)
                        print(f"Received result for URL: {result['url']}, Success: {result['success']}")
                    except json.JSONDecodeError as e:
                        # Tolerate a malformed line; keep consuming the stream.
                        print(f"Error decoding response: {e}")
                        continue
        except Exception as e:
            # Broad catch is acceptable in a smoke test: report and move on.
            print(f"Error during streaming: {e}")
async def test_complex_config():
    """Test API with complex nested configurations.

    Exercises serialization of nested strategy objects: a
    DefaultMarkdownGenerator wrapping a PruningContentFilter, expressed as
    nested ``{"type": ..., "params": ...}`` dicts for the server to
    deserialize back into crawler strategy instances.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/crawl",
            timeout=30.0, json={
                "urls": ["https://en.wikipedia.org/wiki/Apple"],
                "browser_config": {
                    "headless": True,
                    "verbose": False
                },
                "crawler_config": {
                    "cache_mode": "BYPASS",
                    # Strip page chrome so the content filter only sees article text.
                    "excluded_tags": ["nav", "footer", "aside"],
                    "remove_overlay_elements": True,
                    "markdown_generator": {
                        "type": "DefaultMarkdownGenerator",
                        "params": {
                            "content_filter": {
                                "type": "PruningContentFilter",
                                "params": {
                                    "threshold": 0.48,
                                    "threshold_type": "fixed",
                                    "min_word_threshold": 0
                                }
                            },
                            "options": {"ignore_links": True}
                        }
                    }
                }
            })
        result = response.json()
        # NOTE(review): assumes the response body always carries a top-level
        # 'success' flag and per-result 'markdown_v2' with raw_markdown /
        # fit_markdown keys — confirm against the server's response schema.
        if result['success']:
            for r in result['results']:
                print(f"Full Markdown Length: {len(r['markdown_v2']['raw_markdown'])}")
                print(f"Fit Markdown Length: {len(r['markdown_v2']['fit_markdown'])}")
async def main():
    """Driver for the API smoke tests.

    Only the complex-configuration test is currently active; the regular
    and streaming tests are kept below, disabled, for quick re-enabling.
    """
    print("Testing Crawl4AI API...")
    # print("\n1. Testing regular (non-streaming) endpoint...")
    # await test_regular()
    # print("\n2. Testing streaming endpoint...")
    # await test_streaming()
    print("\n3. Testing complex configuration...")
    await test_complex_config()
# Script entry point: run the async test driver on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -164,7 +164,7 @@ async def main():
"""Run all tests"""
# Test direct API
print("Testing direct API calls...")
# await test_direct_api()
await test_direct_api()
# Test client SDK
print("\nTesting client SDK...")