feat(docker): enhance Docker deployment setup and configuration
Add comprehensive Docker deployment configuration with: new .dockerignore and .llm.env.example files; an enhanced Dockerfile with multi-stage build and optimizations; a detailed README with setup instructions and environment configurations; an improved requirements.txt with Gunicorn; and better error handling in async_configs.py. BREAKING CHANGE: Docker deployment now requires a .llm.env file for API keys.
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -250,3 +250,4 @@ continue_config.json
|
||||
|
||||
.prompts/
|
||||
|
||||
.llm.env
|
||||
@@ -1,3 +1,3 @@
|
||||
# crawl4ai/_version.py
|
||||
# __version__ = "0.4.3b3"
|
||||
__version__ = "0.4.300b4"
|
||||
__version__ = "0.4.300"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from regex import B
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
@@ -359,7 +360,7 @@ class BrowserConfig():
|
||||
@staticmethod
|
||||
def load( data: dict) -> "BrowserConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
return from_serializable_dict(data)
|
||||
return from_serializable_dict(data) if data else BrowserConfig()
|
||||
|
||||
|
||||
class CrawlerRunConfig():
|
||||
@@ -794,7 +795,7 @@ class CrawlerRunConfig():
|
||||
@staticmethod
|
||||
def load(data: dict) -> "CrawlerRunConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
return from_serializable_dict(data)
|
||||
return from_serializable_dict(data) if data else CrawlerRunConfig()
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
|
||||
31
deploy/docker/.dockerignore
Normal file
31
deploy/docker/.dockerignore
Normal file
@@ -0,0 +1,31 @@
|
||||
# .dockerignore
|
||||
*
|
||||
|
||||
# Allow specific files and directories when using local installation
|
||||
!crawl4ai/
|
||||
!docs/
|
||||
!deploy/docker/
|
||||
!setup.py
|
||||
!pyproject.toml
|
||||
!README.md
|
||||
!LICENSE
|
||||
!MANIFEST.in
|
||||
!setup.cfg
|
||||
!mkdocs.yml
|
||||
|
||||
.git/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.DS_Store
|
||||
.env
|
||||
.venv
|
||||
venv/
|
||||
tests/
|
||||
coverage.xml
|
||||
*.log
|
||||
*.swp
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
8
deploy/docker/.llm.env.example
Normal file
8
deploy/docker/.llm.env.example
Normal file
@@ -0,0 +1,8 @@
|
||||
# LLM Provider Keys
|
||||
OPENAI_API_KEY=your_openai_key_here
|
||||
DEEPSEEK_API_KEY=your_deepseek_key_here
|
||||
ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
@@ -1,18 +1,174 @@
|
||||
FROM python:3.10-slim
|
||||
|
||||
# Install system dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
# Set build arguments
|
||||
ARG APP_HOME=/app
|
||||
ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
|
||||
ARG GITHUB_BRANCH=main
|
||||
ARG USE_LOCAL=false
|
||||
|
||||
# 🤓 Environment variables - because who doesn't love a good ENV party?
|
||||
ENV PYTHONFAULTHANDLER=1 \
|
||||
PYTHONHASHSEED=random \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_DEFAULT_TIMEOUT=100 \
|
||||
DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Other build arguments
|
||||
ARG PYTHON_VERSION=3.10
|
||||
ARG INSTALL_TYPE=default
|
||||
ARG ENABLE_GPU=false
|
||||
ARG TARGETARCH
|
||||
|
||||
# 🎯 Platform-specific labels - because even containers need ID badges
|
||||
LABEL maintainer="unclecode"
|
||||
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
LABEL version="1.0"
|
||||
|
||||
# 📦 Installing system dependencies... please hold, your package is being delivered
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
wget \
|
||||
gnupg \
|
||||
git \
|
||||
cmake \
|
||||
pkg-config \
|
||||
python3-dev \
|
||||
libjpeg-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Playwright dependencies
|
||||
# 🎭 Playwright dependencies - because browsers need their vitamins too
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libglib2.0-0 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libdbus-1-3 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
libx11-6 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libatspi2.0-0 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# 🎮 GPU support - because sometimes CPU just doesn't cut it
|
||||
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
nvidia-cuda-toolkit \
|
||||
&& rm -rf /var/lib/apt/lists/* ; \
|
||||
else \
|
||||
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
||||
fi
|
||||
|
||||
# 🏗️ Platform-specific optimizations - because one size doesn't fit all
|
||||
RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
echo "🦾 Installing ARM-specific optimizations"; \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
libopenblas-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*; \
|
||||
elif [ "$TARGETARCH" = "amd64" ]; then \
|
||||
echo "🖥️ Installing AMD64-specific optimizations"; \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
libomp-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
|
||||
WORKDIR ${APP_HOME}
|
||||
|
||||
# 🔄 Installation script - now with retry logic because sometimes Git needs a coffee break
|
||||
RUN echo '#!/bin/bash\n\
|
||||
if [ "$USE_LOCAL" = "true" ]; then\n\
|
||||
echo "📦 Installing from local source..."\n\
|
||||
pip install --no-cache-dir /tmp/project/\n\
|
||||
else\n\
|
||||
echo "🌐 Installing from GitHub..."\n\
|
||||
for i in {1..3}; do \n\
|
||||
git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
|
||||
{ echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
|
||||
done\n\
|
||||
pip install --no-cache-dir /tmp/crawl4ai\n\
|
||||
fi' > /tmp/install.sh && chmod +x /tmp/install.sh
|
||||
|
||||
# Copy local project if USE_LOCAL is true
|
||||
COPY . /tmp/project/
|
||||
|
||||
# Copy and install other requirements
|
||||
COPY deploy/docker/requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Install ML dependencies first for better layer caching
|
||||
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
||||
pip install --no-cache-dir \
|
||||
torch \
|
||||
torchvision \
|
||||
torchaudio \
|
||||
scikit-learn \
|
||||
nltk \
|
||||
transformers \
|
||||
tokenizers && \
|
||||
python -m nltk.downloader punkt stopwords ; \
|
||||
fi
|
||||
|
||||
# Install the package
|
||||
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
||||
pip install "/tmp/project/[all]" && \
|
||||
python -m crawl4ai.model_loader ; \
|
||||
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
|
||||
pip install "/tmp/project/[torch]" ; \
|
||||
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
|
||||
pip install "/tmp/project/[transformer]" && \
|
||||
python -m crawl4ai.model_loader ; \
|
||||
else \
|
||||
pip install "/tmp/project" ; \
|
||||
fi
|
||||
|
||||
# 🚀 Installation validation - trust but verify!
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
/tmp/install.sh && \
|
||||
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
|
||||
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
|
||||
|
||||
RUN playwright install --with-deps chromium
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
# Copy application files
|
||||
COPY deploy/docker/* ${APP_HOME}/
|
||||
|
||||
COPY . .
|
||||
# 🏥 Health check - now with memory validation!
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD bash -c '\
|
||||
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
||||
if [ $MEM -lt 2048 ]; then \
|
||||
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
|
||||
exit 1; \
|
||||
fi && \
|
||||
curl -f http://localhost:8000/health || exit 1'
|
||||
|
||||
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
# Entrypoint script
|
||||
COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
|
||||
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
|
||||
ENTRYPOINT ["docker-entrypoint.sh"]
|
||||
|
||||
# Default command - may the server be with you! 🚀
|
||||
CMD ["gunicorn", \
|
||||
"--bind", "0.0.0.0:8000", \
|
||||
"--workers", "4", \
|
||||
"--threads", "2", \
|
||||
"--timeout", "120", \
|
||||
"--graceful-timeout", "30", \
|
||||
"--log-level", "info", \
|
||||
"--worker-class", "uvicorn.workers.UvicornWorker", \
|
||||
"server:app"]
|
||||
113
deploy/docker/README.md
Normal file
113
deploy/docker/README.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# Crawl4AI Docker Setup
|
||||
|
||||
## Quick Start
|
||||
1. Build the Docker image:
|
||||
```bash
|
||||
docker build -t crawl4ai-server:prod .
|
||||
```
|
||||
|
||||
2. Run the container:
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
--env-file .llm.env \
|
||||
--name crawl4ai \
|
||||
crawl4ai-server:prod
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration Options
|
||||
|
||||
### 1. **Using .llm.env File**
|
||||
Create a `.llm.env` file with your API keys:
|
||||
```bash
|
||||
OPENAI_API_KEY=sk-your-key
|
||||
DEEPSEEK_API_KEY=your-deepseek-key
|
||||
```
|
||||
|
||||
Run with:
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
--env-file .llm.env \
|
||||
crawl4ai-server:prod
|
||||
```
|
||||
|
||||
### 2. **Direct Environment Variables**
|
||||
Pass keys directly:
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
-e OPENAI_API_KEY="sk-your-key" \
|
||||
-e DEEPSEEK_API_KEY="your-deepseek-key" \
|
||||
crawl4ai-server:prod
|
||||
```
|
||||
|
||||
### 3. **Copy Host Environment Variables**
|
||||
Use the `--copy-env` flag to copy `.llm.env` from the host:
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
--copy-env \
|
||||
crawl4ai-server:prod
|
||||
```
|
||||
|
||||
### 4. **Advanced: Docker Compose**
|
||||
Create a `docker-compose.yml`:
|
||||
```yaml
|
||||
version: '3.8'
|
||||
services:
|
||||
crawl4ai:
|
||||
image: crawl4ai-server:prod
|
||||
ports:
|
||||
- "8000:8000"
|
||||
env_file:
|
||||
- .llm.env
|
||||
restart: unless-stopped
|
||||
```
|
||||
|
||||
Run with:
|
||||
```bash
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Supported Environment Variables
|
||||
| Variable | Description |
|
||||
|------------------------|--------------------------------------|
|
||||
| `OPENAI_API_KEY` | OpenAI API key |
|
||||
| `DEEPSEEK_API_KEY` | DeepSeek API key |
|
||||
| `ANTHROPIC_API_KEY` | Anthropic API key |
|
||||
| `GROQ_API_KEY` | Groq API key |
|
||||
| `TOGETHER_API_KEY` | Together API key |
|
||||
| `LLAMA_CLOUD_API_KEY` | Llama Cloud API key |
|
||||
| `COHERE_API_KEY` | Cohere API key |
|
||||
| `MISTRAL_API_KEY` | Mistral API key |
|
||||
| `PERPLEXITY_API_KEY` | Perplexity API key |
|
||||
| `VERTEXAI_PROJECT_ID` | Google Vertex AI project ID |
|
||||
| `VERTEXAI_LOCATION` | Google Vertex AI location |
|
||||
|
||||
---
|
||||
|
||||
## Healthcheck
|
||||
The container's built-in healthcheck probes the `/health` endpoint on port 8000; you can query the same endpoint manually:
|
||||
```bash
|
||||
curl http://localhost:8000/health
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
1. **Missing Keys**: Ensure all required keys are set in `.llm.env`.
|
||||
2. **Permissions**: Run `chmod +x docker-entrypoint.sh` if permissions are denied.
|
||||
3. **Logs**: Check logs with:
|
||||
```bash
|
||||
docker logs crawl4ai
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security Best Practices
|
||||
- Never commit `.llm.env` to version control.
|
||||
- Use Docker secrets in production (Swarm/K8s).
|
||||
- Rotate keys regularly.
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
crawl4ai
|
||||
fastapi
|
||||
uvicorn
|
||||
uvicorn
|
||||
gunicorn>=23.0.0
|
||||
@@ -1,108 +0,0 @@
|
||||
import httpx
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
async def test_regular():
    """Exercise the non-streaming ``/crawl`` endpoint and print each result.

    Posts three identical URLs to the server on ``localhost:8000`` with
    ``stream`` disabled, then prints how many results came back and the
    URL / success flag of each one.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            json={
                "urls": ["https://example.com"] * 3,  # Test with 3 identical URLs
                "browser_config": {
                    "headless": True,
                    "verbose": False,
                },
                "crawler_config": {
                    "cache_mode": "BYPASS",
                    "stream": False,
                },
            },
            # httpx defaults to a 5 s timeout; use the same 30 s budget as the
            # other tests in this script.
            timeout=30.0,
        )
        # Surface HTTP errors directly instead of as a KeyError on
        # results['results'] below.
        response.raise_for_status()
        results = response.json()

        print("\nRegular Response:")
        print(f"Got {len(results['results'])} results at once")
        for result in results['results']:
            print(f"URL: {result['url']}, Success: {result['success']}")
|
||||
|
||||
async def test_streaming():
    """Exercise the streaming ``/crawl`` endpoint and print results as they arrive.

    Posts three identical URLs with ``stream`` enabled and parses each
    non-empty response line as a JSON document. The request is issued with
    ``client.stream`` so lines are consumed incrementally; a plain
    ``client.post`` would buffer the whole body before returning and would
    not exercise the streaming path.

    Errors are reported on stdout rather than raised, so the remaining tests
    in the script can still run.
    """
    payload = {
        "urls": ["https://example.com"] * 3,
        "browser_config": {
            "headless": True,
            "verbose": False,
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "stream": True,
        },
    }

    async with httpx.AsyncClient() as client:
        try:
            async with client.stream(
                "POST",
                "http://localhost:8000/crawl",
                json=payload,
                timeout=30.0,
            ) as response:
                print("\nStreaming Response:")
                async for line in response.aiter_lines():
                    if not line.strip():
                        continue
                    try:
                        result = json.loads(line)
                        print(f"Received result for URL: {result['url']}, Success: {result['success']}")
                    except json.JSONDecodeError as e:
                        # A malformed line should not abort the whole stream.
                        print(f"Error decoding response: {e}")
                        continue
        except Exception as e:
            # Deliberately broad: this is a best-effort smoke test, so any
            # transport or protocol failure is reported and swallowed.
            print(f"Error during streaming: {e}")
|
||||
|
||||
async def test_complex_config():
    """Call ``/crawl`` with a nested markdown-generator configuration.

    Sends a single Wikipedia URL together with a crawler configuration that
    nests a ``PruningContentFilter`` inside a ``DefaultMarkdownGenerator``.
    When the response reports success, prints the raw and fit markdown
    lengths of every result; otherwise prints nothing.
    """
    pruning_filter = {
        "type": "PruningContentFilter",
        "params": {
            "threshold": 0.48,
            "threshold_type": "fixed",
            "min_word_threshold": 0,
        },
    }
    markdown_generator = {
        "type": "DefaultMarkdownGenerator",
        "params": {
            "content_filter": pruning_filter,
            "options": {"ignore_links": True},
        },
    }
    request_body = {
        "urls": ["https://en.wikipedia.org/wiki/Apple"],
        "browser_config": {
            "headless": True,
            "verbose": False,
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "excluded_tags": ["nav", "footer", "aside"],
            "remove_overlay_elements": True,
            "markdown_generator": markdown_generator,
        },
    }

    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl",
            timeout=30.0,
            json=request_body,
        )

        result = response.json()
        if not result['success']:
            return

        for entry in result['results']:
            markdown = entry['markdown_v2']
            print(f"Full Markdown Length: {len(markdown['raw_markdown'])}")
            print(f"Fit Markdown Length: {len(entry['markdown_v2']['fit_markdown'])}")
|
||||
|
||||
async def main():
    """Run the Crawl4AI API smoke tests against the local server.

    Only the complex-configuration test is currently enabled; the regular
    and streaming tests are kept below as commented-out calls.
    """
    print("Testing Crawl4AI API...")

    # print("\n1. Testing regular (non-streaming) endpoint...")
    # await test_regular()

    # print("\n2. Testing streaming endpoint...")
    # await test_streaming()

    print("\n3. Testing complex configuration...")
    await test_complex_config()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -164,7 +164,7 @@ async def main():
|
||||
"""Run all tests"""
|
||||
# Test direct API
|
||||
print("Testing direct API calls...")
|
||||
# await test_direct_api()
|
||||
await test_direct_api()
|
||||
|
||||
# Test client SDK
|
||||
print("\nTesting client SDK...")
|
||||
|
||||
Reference in New Issue
Block a user