diff --git a/.gitignore b/.gitignore index 5d39e6e9..e10e3e88 100644 --- a/.gitignore +++ b/.gitignore @@ -250,3 +250,4 @@ continue_config.json .prompts/ +.llm.env \ No newline at end of file diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 5955f704..58b9c4ec 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py # __version__ = "0.4.3b3" -__version__ = "0.4.300b4" +__version__ = "0.4.300" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 2e401686..3e9f582f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -359,7 +359,7 @@ class BrowserConfig(): @staticmethod def load( data: dict) -> "BrowserConfig": # Deserialize the object from a dictionary - return from_serializable_dict(data) + return from_serializable_dict(data) if data else BrowserConfig() class CrawlerRunConfig(): @@ -794,7 +794,7 @@ class CrawlerRunConfig(): @staticmethod def load(data: dict) -> "CrawlerRunConfig": # Deserialize the object from a dictionary - return from_serializable_dict(data) + return from_serializable_dict(data) if data else CrawlerRunConfig() def to_dict(self): return { diff --git a/deploy/docker/.dockerignore b/deploy/docker/.dockerignore new file mode 100644 index 00000000..6f126444 --- /dev/null +++ b/deploy/docker/.dockerignore @@ -0,0 +1,31 @@ +# .dockerignore +* + +# Allow specific files and directories when using local installation +!crawl4ai/ +!docs/ +!deploy/docker/ +!setup.py +!pyproject.toml +!README.md +!LICENSE +!MANIFEST.in +!setup.cfg +!mkdocs.yml + +.git/ +__pycache__/ +*.pyc +*.pyo +*.pyd +.DS_Store +.env +.venv +venv/ +tests/ +coverage.xml +*.log +*.swp +*.egg-info/ +dist/ +build/ \ No newline at end of file diff --git a/deploy/docker/.llm.env.example b/deploy/docker/.llm.env.example new file mode 100644 index 00000000..5fee4a93 --- 
/dev/null +++ b/deploy/docker/.llm.env.example @@ -0,0 +1,8 @@ +# LLM Provider Keys +OPENAI_API_KEY=your_openai_key_here +DEEPSEEK_API_KEY=your_deepseek_key_here +ANTHROPIC_API_KEY=your_anthropic_key_here +GROQ_API_KEY=your_groq_key_here +TOGETHER_API_KEY=your_together_key_here +MISTRAL_API_KEY=your_mistral_key_here +GEMINI_API_TOKEN=your_gemini_key_here \ No newline at end of file diff --git a/deploy/docker/Dockerfile b/deploy/docker/Dockerfile index 864b9f27..d8aa2757 100644 --- a/deploy/docker/Dockerfile +++ b/deploy/docker/Dockerfile @@ -1,18 +1,174 @@ FROM python:3.10-slim -# Install system dependencies -RUN apt-get update && apt-get install -y \ +# Set build arguments +ARG APP_HOME=/app +ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git +ARG GITHUB_BRANCH=main +ARG USE_LOCAL=false + +# 🤓 Environment variables - because who doesn't love a good ENV party? +ENV PYTHONFAULTHANDLER=1 \ + PYTHONHASHSEED=random \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=100 \ + DEBIAN_FRONTEND=noninteractive + +# Other build arguments +ARG PYTHON_VERSION=3.10 +ARG INSTALL_TYPE=default +ARG ENABLE_GPU=false +ARG TARGETARCH + +# 🎯 Platform-specific labels - because even containers need ID badges +LABEL maintainer="unclecode" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +LABEL version="1.0" + +# 📦 Installing system dependencies... 
please hold, your package is being delivered +RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ + curl \ wget \ + gnupg \ + git \ + cmake \ + pkg-config \ + python3-dev \ + libjpeg-dev \ && rm -rf /var/lib/apt/lists/* -# Install Playwright dependencies +# 🎭 Playwright dependencies - because browsers need their vitamins too +RUN apt-get update && apt-get install -y --no-install-recommends \ + libglib2.0-0 \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxcb1 \ + libxkbcommon0 \ + libx11-6 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# 🎮 GPU support - because sometimes CPU just doesn't cut it +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi + +# 🏗️ Platform-specific optimizations - because one size doesn't fit all +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + echo "🦾 Installing ARM-specific optimizations"; \ + apt-get update && apt-get install -y --no-install-recommends \ + libopenblas-dev \ + && rm -rf /var/lib/apt/lists/*; \ +elif [ "$TARGETARCH" = "amd64" ]; then \ + echo "🖥️ Installing AMD64-specific optimizations"; \ + apt-get update && apt-get install -y --no-install-recommends \ + libomp-dev \ + && rm -rf /var/lib/apt/lists/*; \ +fi + +WORKDIR ${APP_HOME} + +# 🔄 Installation script - now with retry logic because sometimes Git needs a coffee break +RUN echo '#!/bin/bash\n\ +if [ "$USE_LOCAL" = "true" ]; then\n\ + echo "📦 Installing from local source..."\n\ + pip install --no-cache-dir /tmp/project/\n\ +else\n\ + echo "🌐 Installing from GitHub..."\n\ + for i in 
{1..3}; do \n\ + git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\ + { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\ + done\n\ + pip install --no-cache-dir /tmp/crawl4ai\n\ +fi' > /tmp/install.sh && chmod +x /tmp/install.sh + +# Copy local project if USE_LOCAL is true +COPY . /tmp/project/ + +# Copy and install other requirements +COPY deploy/docker/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Install ML dependencies first for better layer caching +RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ + pip install --no-cache-dir \ + torch \ + torchvision \ + torchaudio \ + scikit-learn \ + nltk \ + transformers \ + tokenizers && \ + python -m nltk.downloader punkt stopwords ; \ + fi + +# Install the package +RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ + pip install "/tmp/project/[all]" && \ + python -m crawl4ai.model_loader ; \ + elif [ "$INSTALL_TYPE" = "torch" ] ; then \ + pip install "/tmp/project/[torch]" ; \ + elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ + pip install "/tmp/project/[transformer]" && \ + python -m crawl4ai.model_loader ; \ + else \ + pip install "/tmp/project" ; \ + fi + + # 🚀 Installation validation - trust but verify! +RUN pip install --no-cache-dir --upgrade pip && \ + /tmp/install.sh && \ + python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ + python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" + RUN playwright install --with-deps chromium -WORKDIR /app -COPY requirements.txt . -RUN pip install -r requirements.txt +# Copy application files +COPY deploy/docker/* ${APP_HOME}/ -COPY . . +# 🏥 Health check - now with memory validation! +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD bash -c '\ + MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ + if [ $MEM -lt 2048 ]; then \ + echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 
🚀"; \ + exit 1; \ + fi && \ + curl -f http://localhost:8000/health || exit 1' -CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +# Entrypoint script +COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/ +RUN chmod +x /usr/local/bin/docker-entrypoint.sh +ENTRYPOINT ["docker-entrypoint.sh"] + +# Default command - may the server be with you! 🚀 +CMD ["gunicorn", \ + "--bind", "0.0.0.0:8000", \ + "--workers", "4", \ + "--threads", "2", \ + "--timeout", "120", \ + "--graceful-timeout", "30", \ + "--log-level", "info", \ + "--worker-class", "uvicorn.workers.UvicornWorker", \ + "server:app"] \ No newline at end of file diff --git a/deploy/docker/README.md b/deploy/docker/README.md new file mode 100644 index 00000000..b7d9533e --- /dev/null +++ b/deploy/docker/README.md @@ -0,0 +1,113 @@ +# Crawl4AI Docker Setup + +## Quick Start +1. Build the Docker image: + ```bash + docker build -t crawl4ai-server:prod . + ``` + +2. Run the container: + ```bash + docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --name crawl4ai \ + crawl4ai-server:prod + ``` + +--- + +## Configuration Options + +### 1. **Using .llm.env File** +Create a `.llm.env` file with your API keys: +```bash +OPENAI_API_KEY=sk-your-key +DEEPSEEK_API_KEY=your-deepseek-key +``` + +Run with: +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + crawl4ai-server:prod +``` + +### 2. **Direct Environment Variables** +Pass keys directly: +```bash +docker run -d -p 8000:8000 \ + -e OPENAI_API_KEY="sk-your-key" \ + -e DEEPSEEK_API_KEY="your-deepseek-key" \ + crawl4ai-server:prod +``` + +### 3. **Copy Host Environment Variables** +Use the `--copy-env` flag to copy `.llm.env` from the host: +```bash +docker run -d -p 8000:8000 \ + --copy-env \ + crawl4ai-server:prod +``` + +### 4. 
**Advanced: Docker Compose** +Create a `docker-compose.yml`: +```yaml +version: '3.8' +services: + crawl4ai: + image: crawl4ai-server:prod + ports: + - "8000:8000" + env_file: + - .llm.env + restart: unless-stopped +``` + +Run with: +```bash +docker-compose up -d +``` + +--- + +## Supported Environment Variables +| Variable | Description | +|------------------------|--------------------------------------| +| `OPENAI_API_KEY` | OpenAI API key | +| `DEEPSEEK_API_KEY` | DeepSeek API key | +| `ANTHROPIC_API_KEY` | Anthropic API key | +| `GROQ_API_KEY` | Groq API key | +| `TOGETHER_API_KEY` | Together API key | +| `LLAMA_CLOUD_API_KEY` | Llama Cloud API key | +| `COHERE_API_KEY` | Cohere API key | +| `MISTRAL_API_KEY` | Mistral API key | +| `PERPLEXITY_API_KEY` | Perplexity API key | +| `VERTEXAI_PROJECT_ID` | Google Vertex AI project ID | +| `VERTEXAI_LOCATION` | Google Vertex AI location | + +--- + +## Healthcheck +The container includes a healthcheck: +```bash +curl http://localhost:8000/health +``` + +--- + +## Troubleshooting +1. **Missing Keys**: Ensure all required keys are set in `.llm.env`. +2. **Permissions**: Run `chmod +x docker-entrypoint.sh` if permissions are denied. +3. **Logs**: Check logs with: + ```bash + docker logs crawl4ai + ``` + +--- + +## Security Best Practices +- Never commit `.llm.env` to version control. +- Use Docker secrets in production (Swarm/K8s). +- Rotate keys regularly. 
+ + diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index 9636247d..a395fa85 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -1,3 +1,4 @@ crawl4ai fastapi -uvicorn \ No newline at end of file +uvicorn +gunicorn>=23.0.0 \ No newline at end of file diff --git a/deploy/docker/test.py b/deploy/docker/test.py deleted file mode 100644 index c0b27ea9..00000000 --- a/deploy/docker/test.py +++ /dev/null @@ -1,108 +0,0 @@ -import httpx -import asyncio -import json - -async def test_regular(): - """Test non-streaming API call""" - async with httpx.AsyncClient() as client: - response = await client.post("http://localhost:8000/crawl", json={ - "urls": ["https://example.com"] * 3, # Test with 3 identical URLs - "browser_config": { - "headless": True, - "verbose": False - }, - "crawler_config": { - "cache_mode": "BYPASS", - "stream": False - } - }) - results = response.json() - print("\nRegular Response:") - print(f"Got {len(results['results'])} results at once") - for result in results['results']: - print(f"URL: {result['url']}, Success: {result['success']}") - -async def test_streaming(): - """Test streaming API call""" - async with httpx.AsyncClient() as client: - try: - response = await client.post( - "http://localhost:8000/crawl", - json={ - "urls": ["https://example.com"] * 3, - "browser_config": { - "headless": True, - "verbose": False - }, - "crawler_config": { - "cache_mode": "BYPASS", - "stream": True - } - }, - timeout=30.0 - ) - - print("\nStreaming Response:") - async for line in response.aiter_lines(): - if line.strip(): - try: - result = json.loads(line) - print(f"Received result for URL: {result['url']}, Success: {result['success']}") - except json.JSONDecodeError as e: - print(f"Error decoding response: {e}") - continue - except Exception as e: - print(f"Error during streaming: {e}") - -async def test_complex_config(): - """Test API with complex nested configurations""" - async with 
httpx.AsyncClient() as client: - response = await client.post("http://localhost:8000/crawl", - timeout=30.0, json={ - "urls": ["https://en.wikipedia.org/wiki/Apple"], - "browser_config": { - "headless": True, - "verbose": False - }, - "crawler_config": { - "cache_mode": "BYPASS", - "excluded_tags": ["nav", "footer", "aside"], - "remove_overlay_elements": True, - "markdown_generator": { - "type": "DefaultMarkdownGenerator", - "params": { - "content_filter": { - "type": "PruningContentFilter", - "params": { - "threshold": 0.48, - "threshold_type": "fixed", - "min_word_threshold": 0 - } - }, - "options": {"ignore_links": True} - } - } - } - }) - - result = response.json() - if result['success']: - for r in result['results']: - print(f"Full Markdown Length: {len(r['markdown_v2']['raw_markdown'])}") - print(f"Fit Markdown Length: {len(r['markdown_v2']['fit_markdown'])}") - -async def main(): - """Run both tests""" - print("Testing Crawl4AI API...") - - # print("\n1. Testing regular (non-streaming) endpoint...") - # await test_regular() - - # print("\n2. Testing streaming endpoint...") - # await test_streaming() - - print("\n3. Testing complex configuration...") - await test_complex_config() - -if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file diff --git a/tests/docker/test_docker.py b/tests/docker/test_docker.py index 5b40bda4..4327b3ac 100644 --- a/tests/docker/test_docker.py +++ b/tests/docker/test_docker.py @@ -164,7 +164,7 @@ async def main(): """Run all tests""" # Test direct API print("Testing direct API calls...") - # await test_direct_api() + await test_direct_api() # Test client SDK print("\nTesting client SDK...")