diff --git a/Dockerfile b/Dockerfile index 9796bcb6..8b84f797 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ ARG TARGETARCH LABEL maintainer="unclecode" LABEL description="πŸ”₯πŸ•·οΈ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -LABEL version="1.0" +LABEL version="1.0" RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libjpeg-dev \ redis-server \ supervisor \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libcairo2 \ libasound2 \ libatspi2.0-0 \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ apt-get update && apt-get install -y --no-install-recommends \ nvidia-cuda-toolkit \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* ; \ else \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ @@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \ echo "🦾 Installing ARM-specific optimizations"; \ apt-get update && apt-get install -y --no-install-recommends \ libopenblas-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ elif [ "$TARGETARCH" = "amd64" ]; then \ echo "πŸ–₯️ Installing AMD64-specific optimizations"; \ apt-get update && apt-get install -y --no-install-recommends \ libomp-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ else \ echo "Skipping platform-specific optimizations (unsupported platform)"; \ fi +# Create a non-root user and group +RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser + +# Create and set permissions for appuser home directory +RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser + WORKDIR ${APP_HOME} RUN echo '#!/bin/bash\n\ @@ -103,6 +114,7 @@ fi' > 
/tmp/install.sh && chmod +x /tmp/install.sh COPY . /tmp/project/ +# Copy supervisor config first (might need root later, but okay for now) COPY deploy/docker/supervisord.conf . COPY deploy/docker/requirements.txt . @@ -131,16 +143,23 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ else \ pip install "/tmp/project" ; \ fi - + RUN pip install --no-cache-dir --upgrade pip && \ /tmp/install.sh && \ python -c "import crawl4ai; print('βœ… crawl4ai is ready to rock!')" && \ python -c "from playwright.sync_api import sync_playwright; print('βœ… Playwright is feeling dramatic!')" - + RUN playwright install --with-deps chromium +# Copy application code COPY deploy/docker/* ${APP_HOME}/ +# Change ownership of the application directory to the non-root user +RUN chown -R appuser:appuser ${APP_HOME} + +# give permissions to redis persistence dirs if used +RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis + HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD bash -c '\ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ @@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ exit 1; \ fi && \ redis-cli ping > /dev/null && \ - curl -f http://localhost:8000/health || exit 1' + curl -f http://localhost:11235/health || exit 1' EXPOSE 6379 -CMD ["supervisord", "-c", "supervisord.conf"] - +# Switch to the non-root user before starting the application +USER appuser + +CMD ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f99d1cb9..3278c731 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): user_agent = kwargs.get("user_agent", self.user_agent) # Use browser_manager to get a fresh page & context assigned to this session_id - page, context = await 
self.browser_manager.get_page(session_id, user_agent) + page, context = await self.browser_manager.get_page(CrawlerRunConfig( + session_id=session_id, + user_agent=user_agent, + **kwargs, + )) return session_id async def crawl( @@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) + if config.capture_console_messages: + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + captured_console = await self._capture_console_messages(page, url) + return AsyncCrawlResponse( html=html, response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, get_delayed_content=None, + console_messages=captured_console, ) elif url.startswith("raw:") or url.startswith("raw://"): @@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "url": request.url, "method": request.method, "resource_type": request.resource_type, - "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "failure_text": str(request.failure) if request.failure else "Unknown failure", "timestamp": time.time() }) except Exception as e: @@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return None + async def _capture_console_messages( + self, page: Page, file_path: str + ) -> List[Dict[str, Union[str, float]]]: + """ + Captures console messages from the page. 
+ Args: + + page (Page): The Playwright page object + Returns: + List[Dict[str, Union[str, float]]]: A list of captured console messages + """ + captured_console = [] + + def handle_console_message(msg): + try: + message_type = msg.type + message_text = msg.text + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time(), + } + captured_console.append(entry) + except Exception as e: + if self.logger: + self.logger.warning( + f"Error capturing console message: {e}", tag="CAPTURE" + ) + + page.on("console", handle_console_message) + + await page.goto(file_path) + + return captured_console + async def take_screenshot(self, page, **kwargs) -> str: """ Take a screenshot of the current page. diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 7fc819e0..f3c7d861 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -658,7 +658,7 @@ class BrowserManager: "name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url - if crawlerRunConfig + if crawlerRunConfig and crawlerRunConfig.url else "https://crawl4ai.com/", } ] diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index b7e6d8ad..40a33a79 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -1,4 +1,3 @@ -crawl4ai fastapi uvicorn gunicorn>=23.0.0 diff --git a/deploy/docker/supervisord.conf b/deploy/docker/supervisord.conf index 1274f2c3..d51cc953 100644 --- a/deploy/docker/supervisord.conf +++ b/deploy/docker/supervisord.conf @@ -1,12 +1,28 @@ [supervisord] -nodaemon=true +nodaemon=true ; Run supervisord in the foreground +logfile=/dev/null ; Do not write a supervisord log file (foreground mode already logs to stdout) +logfile_maxbytes=0 [program:redis] -command=redis-server +command=/usr/bin/redis-server --loglevel notice ; Absolute path to redis-server (installed via apt in the Dockerfile) +user=appuser ; Run redis as our non-root user autorestart=true priority=10 +stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
+stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr +stderr_logfile_maxbytes=0 [program:gunicorn] -command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app +command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app +directory=/app ; Working directory for the app +user=appuser ; Run gunicorn as our non-root user autorestart=true -priority=20 \ No newline at end of file +priority=20 +environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs +stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr +stderr_logfile_maxbytes=0 + +# Optional: Add filebeat or other logging agents here if needed \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6a7bf7cb..f112f9fd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,15 +1,31 @@ -# Base configuration (not a service, just a reusable config block) +# docker-compose.yml +# This file is in the root directory alongside Dockerfile + +# Base configuration anchor for reusability x-base-config: &base-config ports: + # Map host port 11235 to container port 11235 (where Gunicorn will listen) - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" + # - "8080:8080" # Uncomment if needed + + # Load API keys primarily from .llm.env file + # Create .llm.env in the root directory from deploy/docker/.llm.env.example + env_file: + - .llm.env + + # Define environment variables, allowing overrides from host environment + # Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value 
from .llm.env environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - GROQ_API_KEY=${GROQ_API_KEY:-} + - TOGETHER_API_KEY=${TOGETHER_API_KEY:-} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} + - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-} + volumes: + # Mount /dev/shm for Chromium/Playwright performance - /dev/shm:/dev/shm deploy: resources: @@ -19,47 +35,47 @@ x-base-config: &base-config memory: 1G restart: unless-stopped healthcheck: + # IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf test: ["CMD", "curl", "-f", "http://localhost:11235/health"] interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s # Give the server time to start + # Run the container as the non-root user defined in the Dockerfile + user: "appuser" services: - # Local build services for different platforms - crawl4ai-amd64: + # --- Local Build Services --- + crawl4ai-local-amd64: build: - context: . - dockerfile: Dockerfile + context: . # Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/amd64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile + platform: linux/amd64 profiles: ["local-amd64"] - <<: *base-config # extends yerine doğrudan yapΔ±landΔ±rmayΔ± dahil ettik + <<: *base-config # Inherit base configuration - crawl4ai-arm64: + crawl4ai-local-arm64: build: - context: . - dockerfile: Dockerfile + context: . 
# Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/arm64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + platform: linux/arm64 profiles: ["local-arm64"] <<: *base-config - # Hub services for different platforms and versions + # --- Docker Hub Image Services --- crawl4ai-hub-amd64: - image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + image: unclecode/crawl4ai:${VERSION:-latest}-amd64 profiles: ["hub-amd64"] <<: *base-config crawl4ai-hub-arm64: - image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + image: unclecode/crawl4ai:${VERSION:-latest}-arm64 profiles: ["hub-arm64"] <<: *base-config \ No newline at end of file diff --git a/docs/examples/network_console_capture_example.py b/docs/examples/network_console_capture_example.py index 5305ddc3..0208bdce 100644 --- a/docs/examples/network_console_capture_example.py +++ b/docs/examples/network_console_capture_example.py @@ -357,8 +357,7 @@ async def demo_performance_analysis(): async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( capture_network_requests=True, - wait_until="networkidle", - page_timeout=60000 # 60 seconds + page_timeout=60 * 2 * 1000 # 120 seconds ) result = await crawler.arun( @@ -406,6 +405,13 @@ async def demo_performance_analysis(): "url": url, "duration_ms": duration }) + if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing: + # Convert to milliseconds + duration = (timing["responseStart"] - timing["requestStart"]) * 1000 + resource_timings[resource_type].append({ + "url": url, + "duration_ms": duration + }) # Calculate statistics for each resource type print("\nPerformance by resource type:") @@ -455,14 +461,14 @@ async def main(): os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) # Run basic examples - await 
demo_basic_network_capture() + # await demo_basic_network_capture() await demo_basic_console_capture() - await demo_combined_capture() + # await demo_combined_capture() # Run advanced examples - await analyze_spa_network_traffic() - await demo_security_analysis() - await demo_performance_analysis() + # await analyze_spa_network_traffic() + # await demo_security_analysis() + # await demo_performance_analysis() print("\n=== Examples Complete ===") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md index a3d0def1..b4b6e414 100644 --- a/docs/md_v2/core/docker-deployment.md +++ b/docs/md_v2/core/docker-deployment.md @@ -1,702 +1,833 @@ -# Docker Deployment +# Crawl4AI Docker Guide 🐳 -Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. +## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Local Build](#local-build) + - [Docker Hub](#docker-hub) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Getting Help](#getting-help) -## Quick Start πŸš€ +## Prerequisites -Pull and run the basic version: +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher) +- At least 4GB of RAM available for the container +- Python 3.10+ (if using the Python SDK) +- Node.js 16+ (if using the Node.js examples) + +> πŸ’‘ **Pro tip**: Run `docker info` to check your Docker installation and available resources. 
+ +## Installation + +### Local Build + +Let's get your local environment set up step by step! + +#### 1. Building the Image + +First, clone the repository and build the Docker image: ```bash -# Basic run without security -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai/deploy -# Run with API security enabled -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +# Build the Docker image +docker build --platform=linux/amd64 --no-cache -t crawl4ai . + +# Or build for arm64 +docker build --platform=linux/arm64 --no-cache -t crawl4ai . ``` -## Running with Docker Compose 🐳 +#### 2. Environment Setup -### Use Docker Compose (From Local Dockerfile or Docker Hub) +If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file: -Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. - -### **Option 1: Using Docker Compose to Build Locally** -If you want to build the image locally, use the provided `docker-compose.local.yml` file. - -```bash -docker-compose -f docker-compose.local.yml up -d -``` - -This will: -1. Build the Docker image from the provided `Dockerfile`. -2. Start the container and expose it on `http://localhost:11235`. - ---- - -### **Option 2: Using Docker Compose with Pre-Built Image from Hub** -If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. - -```bash -docker-compose -f docker-compose.hub.yml up -d -``` - -This will: -1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). -2. Start the container and expose it on `http://localhost:11235`. 
- ---- - -### **Stopping the Running Services** - -To stop the services started via Docker Compose, you can use: - -```bash -docker-compose -f docker-compose.local.yml down -# OR -docker-compose -f docker-compose.hub.yml down -``` - -If the containers don’t stop and the application is still running, check the running containers: - -```bash -docker ps -``` - -Find the `CONTAINER ID` of the running service and stop it forcefully: - -```bash -docker stop -``` - ---- - -### **Debugging with Docker Compose** - -- **Check Logs**: To view the container logs: - ```bash - docker-compose -f docker-compose.local.yml logs -f - ``` - -- **Remove Orphaned Containers**: If the service is still running unexpectedly: - ```bash - docker-compose -f docker-compose.local.yml down --remove-orphans - ``` - -- **Manually Remove Network**: If the network is still in use: - ```bash - docker network ls - docker network rm crawl4ai_default - ``` - ---- - -### Why Use Docker Compose? - -Docker Compose is the recommended way to deploy Crawl4AI because: -1. It simplifies multi-container setups. -2. Allows you to define environment variables, resources, and ports in a single file. -3. Makes it easier to switch between local development and production-ready images. - -For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. 
- - - - -## API Security πŸ”’ - -### Understanding CRAWL4AI_API_TOKEN - -The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: - -- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication -- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible - -```bash -# Secured Instance -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all - -# Unsecured Instance -docker run -p 11235:11235 unclecode/crawl4ai:all -``` - -### Making API Calls - -For secured instances, include the token in all requests: - -```python -import requests - -# Setup headers if token is being used -api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN -headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} - -# Making authenticated requests -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://example.com", - "priority": 10 - } -) - -# Checking task status -task_id = response.json()["task_id"] -status = requests.get( - f"http://localhost:11235/task/{task_id}", - headers=headers -) -``` - -### Using with Docker Compose - -In your `docker-compose.yml`: -```yaml -services: - crawl4ai: - image: unclecode/crawl4ai:all - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional - # ... other configuration -``` - -Then either: -1. Set in `.env` file: ```env -CRAWL4AI_API_TOKEN=your_secret_token +# OpenAI +OPENAI_API_KEY=sk-your-key + +# Anthropic +ANTHROPIC_API_KEY=your-anthropic-key + +# DeepSeek +DEEPSEEK_API_KEY=your-deepseek-key + +# Check out https://docs.litellm.ai/docs/providers for more providers! ``` -2. Or set via command line: +> πŸ”‘ **Note**: Keep your API keys secure! Never commit them to version control. + +#### 3. 
Running the Container + +You have several options for running the container: + +Basic run (no LLM support): ```bash -CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +docker run -d -p 8000:8000 --name crawl4ai crawl4ai ``` -> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). +With LLM support: +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --name crawl4ai \ + crawl4ai +``` -## Configuration Options πŸ”§ +Using host environment variables (Not a good practice, but works for local testing): +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --env "$(env)" \ + --name crawl4ai \ + crawl4ai +``` -### Environment Variables - -You can configure the service using environment variables: +#### Multi-Platform Build +For distributing your image across different architectures, use `buildx`: ```bash -# Basic configuration -docker run -p 11235:11235 \ - -e MAX_CONCURRENT_TASKS=5 \ - unclecode/crawl4ai:all +# Set up buildx builder +docker buildx create --use -# With security and LLM support -docker run -p 11235:11235 \ - -e CRAWL4AI_API_TOKEN=your_secret_token \ - -e OPENAI_API_KEY=sk-... \ - -e ANTHROPIC_API_KEY=sk-ant-... \ - unclecode/crawl4ai:all +# Build for multiple platforms +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + -t crawl4ai \ + --push \ + . ``` -### Using Docker Compose (Recommended) 🐳 +> πŸ’‘ **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry. 
-Create a `docker-compose.yml`: +#### Development Build +For development, you might want to enable all features: -```yaml -version: '3.8' - -services: - crawl4ai: - image: unclecode/crawl4ai:all - ports: - - "11235:11235" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security - - MAX_CONCURRENT_TASKS=5 - # LLM Provider Keys - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G -``` - -You can run it in two ways: - -1. Using environment variables directly: ```bash -CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up +docker build -t crawl4ai \ + --build-arg INSTALL_TYPE=all \ + --build-arg PYTHON_VERSION=3.10 \ + --build-arg ENABLE_GPU=true \ + . ``` -2. Using a `.env` file (recommended): -Create a `.env` file in the same directory: -```env -# API Security (optional) -CRAWL4AI_API_TOKEN=your_secret_token +#### GPU-Enabled Build +If you plan to use GPU acceleration: -# LLM Provider Keys -OPENAI_API_KEY=sk-... -ANTHROPIC_API_KEY=sk-ant-... - -# Other Configuration -MAX_CONCURRENT_TASKS=5 -``` - -Then simply run: ```bash -docker-compose up +docker build -t crawl4ai \ + --build-arg ENABLE_GPU=true \ + deploy/docker/ ``` -### Testing the Deployment πŸ§ͺ +### Build Arguments Explained + +| Argument | Description | Default | Options | +|----------|-------------|---------|----------| +| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 | +| INSTALL_TYPE | Feature set | default | default, all, torch, transformer | +| ENABLE_GPU | GPU support | false | true, false | +| APP_HOME | Install path | /app | any valid path | + +### Build Best Practices + +1. **Choose the Right Install Type** + - `default`: Basic installation, smallest image, to be honest, I use this most of the time. + - `all`: Full features, larger image (include transformer, and nltk, make sure you really need them) + +2.
**Platform Considerations** + - Let Docker auto-detect platform unless you need cross-compilation + - Use --platform for specific architecture requirements + - Consider buildx for multi-architecture distribution + +3. **Performance Optimization** + - The image automatically includes platform-specific optimizations + - AMD64 gets OpenMP optimizations + - ARM64 gets OpenBLAS optimizations + +### Docker Hub + +> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned! + +## Using the API + +In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by using the JSON structure and passing it to all the URLs, which I will explain in detail. + +### Python SDK + +The SDK makes things easier! Here's how to use it: ```python -import requests +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig -# For unsecured instances -def test_unsecured(): - # Health check - health = requests.get("http://localhost:11235/health") - print("Health check:", health.json()) - - # Basic crawl - response = requests.post( - "http://localhost:11235/crawl", - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) - -# For secured instances -def test_secured(api_token): - headers = {"Authorization": f"Bearer {api_token}"} - - # Basic crawl with authentication - response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) -``` - -### LLM Extraction Example πŸ€– - -When you've configured your LLM provider keys (via environment variables or `.env`), 
you can use LLM extraction: - -```python -request = { - "urls": "https://example.com", - "extraction_config": { - "type": "llm", - "params": { - "provider": "openai/gpt-4", - "instruction": "Extract main topics from the page" - } - } -} - -# Make the request (add headers if using API security) -response = requests.post("http://localhost:11235/crawl", json=request) -``` - -> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! - - -## Usage Examples πŸ“ - -### Basic Crawling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 -} - -response = requests.post("http://localhost:11235/crawl", json=request) -task_id = response.json()["task_id"] - -# Get results -result = requests.get(f"http://localhost:11235/task/{task_id}") -``` - -### Structured Data Extraction - -```python -schema = { - "name": "Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", - } - ], -} - -request = { - "urls": "https://www.coinbase.com/explore", - "extraction_config": { - "type": "json_css", - "params": {"schema": schema} - } -} -``` - -### Dynamic Content Handling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - "wait_for": "article.tease-card:nth-child(10)" -} -``` - -### AI-Powered Extraction (Full Version) - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "extraction_config": { - "type": "cosine", - "params": { - "semantic_filter": "business finance economy", - "word_count_threshold": 10, - "max_dist": 0.2, - "top_k": 3 - } - } -} -``` - -## Platform-Specific Instructions πŸ’» - -### macOS -```bash -docker pull 
unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -### Ubuntu -```bash -# Basic version -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic - -# With GPU support -docker pull unclecode/crawl4ai:gpu -docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu -``` - -### Windows (PowerShell) -```powershell -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -## Testing πŸ§ͺ - -Save this as `test_docker.py`: - -```python -import requests -import json -import time -import sys - -class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): - self.base_url = base_url +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: + # If JWT is enabled, you can authenticate like this: (more on this later) + # await client.authenticate("test@example.com") - def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict: - # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) - task_id = response.json()["task_id"] - print(f"Task ID: {task_id}") + # Non-streaming crawl + results = await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig() + ) + print(f"Non-streaming results: {results}") - # Poll for result - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError(f"Task {task_id} timeout") - - result = requests.get(f"{self.base_url}/task/{task_id}") - status = result.json() - - if status["status"] == "completed": - return status - - time.sleep(2) - -def test_deployment(): - tester = Crawl4AiTester() - - # Test basic crawl - request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - - result = tester.submit_and_wait(request) - print("Basic crawl successful!") - print(f"Content length: 
{len(result['result']['markdown'])}") + # Streaming crawl + crawler_config = CrawlerRunConfig(stream=True) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=crawler_config + ): + print(f"Streamed result: {result}") + + # Get schema + schema = await client.get_schema() + print(f"Schema: {schema}") if __name__ == "__main__": - test_deployment() + asyncio.run(main()) ``` -## Advanced Configuration βš™οΈ +`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control: -### Crawler Parameters +- `base_url` (str): Base URL of the Crawl4AI Docker server +- `timeout` (float): Default timeout for requests in seconds +- `verify_ssl` (bool): Whether to verify SSL certificates +- `verbose` (bool): Whether to show logging output +- `log_file` (str, optional): Path to log file if file logging is desired -The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use: +This client SDK generates a properly structured JSON request for the server's HTTP API. +## Second Approach: Direct API Calls + +This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works. + +### Understanding Configuration Structure + +Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity. 
+ +#### The Basic Pattern + +Try this in Python to understand the structure: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - # Browser Configuration - "headless": True, # Run in headless mode - "browser_type": "chromium", # chromium/firefox/webkit - "user_agent": "custom-agent", # Custom user agent - "proxy": "http://proxy:8080", # Proxy configuration - - # Performance & Behavior - "page_timeout": 30000, # Page load timeout (ms) - "verbose": True, # Enable detailed logging - "semaphore_count": 5, # Concurrent request limit - - # Anti-Detection Features - "simulate_user": True, # Simulate human behavior - "magic": True, # Advanced anti-detection - "override_navigator": True, # Override navigator properties - - # Session Management - "user_data_dir": "./browser-data", # Browser profile location - "use_managed_browser": True, # Use persistent browser +from crawl4ai import BrowserConfig + +# Create a config and see its structure +config = BrowserConfig(headless=True) +print(config.dump()) +``` + +This outputs: +```json +{ + "type": "BrowserConfig", + "params": { + "headless": true } } ``` -### Extra Parameters +#### Simple vs Complex Values -The `extra` field allows passing additional parameters directly to the crawler's `arun` function: +The structure follows these rules: +- Simple values (strings, numbers, booleans, lists) are passed directly +- Complex values (classes, dictionaries) use the type-params pattern -```python -request = { - "urls": "https://example.com", - "extra": { - "word_count_threshold": 10, # Min words per block - "only_text": True, # Extract only text - "bypass_cache": True, # Force fresh crawl - "process_iframes": True, # Include iframe content - } -} -``` - -### Complete Examples - -1.β€€**Advanced News Crawling** -```python -request = { - "urls": "https://www.nbcnews.com/business", - "crawler_params": { - "headless": True, - "page_timeout": 30000, - "remove_overlay_elements": True # Remove popups - }, - "extra": 
{ - "word_count_threshold": 50, # Longer content blocks - "bypass_cache": True # Fresh content - }, - "css_selector": ".article-body" -} -``` - -2.β€€**Anti-Detection Configuration** -```python -request = { - "urls": "https://example.com", - "crawler_params": { - "simulate_user": True, - "magic": True, - "override_navigator": True, - "user_agent": "Mozilla/5.0 ...", - "headers": { - "Accept-Language": "en-US,en;q=0.9" - } - } -} -``` - -3.β€€**LLM Extraction with Custom Parameters** -```python -request = { - "urls": "https://openai.com/pricing", - "extraction_config": { - "type": "llm", +For example, with dictionaries: +```json +{ + "browser_config": { + "type": "BrowserConfig", "params": { - "provider": "openai/gpt-4", - "schema": pricing_schema + "headless": true, // Simple boolean - direct value + "viewport": { // Complex dictionary - needs type-params + "type": "dict", + "value": { + "width": 1200, + "height": 800 + } + } } - }, - "crawler_params": { - "verbose": True, - "page_timeout": 60000 - }, - "extra": { - "word_count_threshold": 1, - "only_text": True } } ``` -4.β€€**Session-Based Dynamic Content** +#### Strategy Pattern and Nesting + +Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "chunking_strategy": { + "type": "RegexChunking", // Strategy implementation + "params": { + "patterns": ["\n\n", "\\.\\s+"] + } + } + } + } +} +``` + +Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy. 
+ +#### Complex Nested Example + +Let's look at a more complex example with content filtering: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} +``` + +This shows how deeply configurations can nest while maintaining a consistent structure. + +#### Quick Grammar Overview +``` +config := { + "type": string, + "params": { + key: simple_value | complex_value + } +} + +simple_value := string | number | boolean | [simple_value] +complex_value := config | dict_value + +dict_value := { + "type": "dict", + "value": object +} +``` + +#### Important Rules 🚨 + +- Always use the type-params pattern for class instances +- Use direct values for primitives (numbers, strings, booleans) +- Wrap dictionaries with `{"type": "dict", "value": {...}}` +- Arrays/lists are passed directly without type-params +- All parameters are optional unless specifically required + +#### Pro Tip 💡 + +The easiest way to get the correct structure is to: +1. Create configuration objects in Python +2. Use the `dump()` method to see their JSON representation +3. 
Use that JSON in your API calls + +Example: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - "session_id": "dynamic_session", - "headless": False, - "page_timeout": 60000 - }, - "js_code": ["window.scrollTo(0, document.body.scrollHeight);"], - "wait_for": "js:() => document.querySelectorAll('.item').length > 10", - "extra": { - "delay_before_return_html": 2.0 +from crawl4ai import CacheMode, CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed") + ), + cache_mode=CacheMode.BYPASS +) +print(config.dump()) # Use this JSON in your API calls +``` + + +#### More Examples + +**Advanced Crawler Configuration** + +```json +{ + "urls": ["https://example.com"], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed", + "min_word_threshold": 0 + } + } + } + } + } } } ``` -5. **Screenshot with Custom Timing** +**Extraction Strategy**: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract article title, author, publication date and main content", + "provider": "openai/gpt-4", + "api_token": "your-api-token", + "schema": { + "type": "dict", + "value": { + "title": 
"Article Schema", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The article's headline" + }, + "author": { + "type": "string", + "description": "The author's name" + }, + "published_date": { + "type": "string", + "format": "date-time", + "description": "Publication date and time" + }, + "content": { + "type": "string", + "description": "The main article content" + } + }, + "required": ["title", "content"] + } + } + } + } + } + } +} +``` + +**Deep Crawler Example** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 3, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": { + "allowed_types": ["text/html", "application/xhtml+xml"] + } + }, + { + "type": "DomainFilter", + "params": { + "allowed_domains": ["blog.*", "docs.*"] + } + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": { + "keywords": ["tutorial", "guide", "documentation"] + } + }, + { + "type": "PathDepthScorer", + "params": { + "weight": 0.5, + "optimal_depth": 3 + } + } + ] + } + } + } + } + } + } +} +``` + +### REST API Examples + +Let's look at some practical examples: + +#### Simple Crawl + ```python -request = { - "urls": "https://example.com", - "screenshot": True, - "crawler_params": { - "headless": True, - "screenshot_wait_for": ".main-content" - }, - "extra": { - "delay_before_return_html": 3.0 - } +import requests + +crawl_payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True}, + "crawler_config": {"stream": False} } +response = requests.post( + "http://localhost:8000/crawl", + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later + json=crawl_payload +) +print(response.json()) # Print the response for debugging 
``` -### Parameter Reference Table +#### Streaming Results -| Category | Parameter | Type | Description | -|----------|-----------|------|-------------| -| Browser | headless | bool | Run browser in headless mode | -| Browser | browser_type | str | Browser engine selection | -| Browser | user_agent | str | Custom user agent string | -| Network | proxy | str | Proxy server URL | -| Network | headers | dict | Custom HTTP headers | -| Timing | page_timeout | int | Page load timeout (ms) | -| Timing | delay_before_return_html | float | Wait before capture | -| Anti-Detection | simulate_user | bool | Human behavior simulation | -| Anti-Detection | magic | bool | Advanced protection | -| Session | session_id | str | Browser session ID | -| Session | user_data_dir | str | Profile directory | -| Content | word_count_threshold | int | Minimum words per block | -| Content | only_text | bool | Text-only extraction | -| Content | process_iframes | bool | Include iframe content | -| Debug | verbose | bool | Detailed logging | -| Debug | log_console | bool | Browser console logs | +```python +async def test_stream_crawl(session, token: str): + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:8000/crawl/stream" + payload = { + "urls": [ + "https://example.com", + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ], + "browser_config": {"headless": True, "viewport": {"width": 1200}}, + "crawler_config": {"stream": True, "cache_mode": "bypass"} + } -## Troubleshooting 🔍 + headers = {} # Set to {"Authorization": f"Bearer {token}"} if JWT is enabled, more on this later + + try: + async with session.post(url, json=payload, headers=headers) as response: + status = response.status + print(f"Status: {status} (Expected: 200)") + assert status == 200, f"Expected 200, got {status}" + + # Read streaming response line-by-line (NDJSON) + async for line in response.content: + if line: + data = 
json.loads(line.decode('utf-8').strip()) + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") +``` -### Common Issues +## Metrics & Monitoring -1.β€€**Connection Refused** - ``` - Error: Connection refused at localhost:11235 - ``` - Solution: Ensure the container is running and ports are properly mapped. +Keep an eye on your crawler with these endpoints: -2.β€€**Resource Limits** - ``` - Error: No available slots - ``` - Solution: Increase MAX_CONCURRENT_TASKS or container resources. +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema -3.β€€**GPU Access** - ``` - Error: GPU not found - ``` - Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag. - -### Debug Mode - -Access container for debugging: +Example health check: ```bash -docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all +curl http://localhost:8000/health ``` -View container logs: -```bash -docker logs [container_id] +## Deployment Scenarios + +> 🚧 Coming soon! We'll cover: +> - Kubernetes deployment +> - Cloud provider setups (AWS, GCP, Azure) +> - High-availability configurations +> - Load balancing strategies + +## Complete Examples + +Check out the `examples` folder in our repository for full working examples! Here are two to get you started: +[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) +[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security. + +### Understanding config.yml + +The configuration file is located at `deploy/docker/config.yml`. 
You can either modify this file before building the image or mount a custom configuration when running the container. + +Here's a detailed breakdown of the configuration options: + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" # Server title in OpenAPI docs + version: "1.0.0" # API version + host: "0.0.0.0" # Listen on all interfaces + port: 8000 # Server port + reload: True # Enable hot reloading (development only) + timeout_keep_alive: 300 # Keep-alive timeout in seconds + +# Rate Limiting Configuration +rate_limiting: + enabled: True # Enable/disable rate limiting + default_limit: "100/minute" # Rate limit format: "number/timeunit" + trusted_proxies: [] # List of trusted proxy IPs + storage_uri: "memory://" # Use "redis://localhost:6379" for production + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: true # Enable JWT authentication + https_redirect: True # Force HTTPS + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 # Memory usage threshold + rate_limiter: + base_delay: [1.0, 2.0] # Min and max delay between requests + timeouts: + stream_init: 30.0 # Stream initialization timeout + batch_process: 300.0 # Batch processing timeout + +# Logging Configuration +logging: + level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR) + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True # Enable Prometheus metrics + endpoint: "/metrics" # Metrics endpoint + health_check: + endpoint: "/health" # Health check endpoint ``` -## Best Practices 🌟 +### JWT Authentication -1.β€€**Resource Management** - - Set 
appropriate memory and CPU limits - - Monitor resource usage via health endpoint - - Use basic version for simple crawling tasks +When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works: -2. **Scaling** - - Use multiple containers for high load - - Implement proper load balancing - - Monitor performance metrics - -3. **Security** - - Use environment variables for sensitive data - - Implement proper network isolation - - Regular security updates - -## API Reference 📚 - -### Health Check -```http -GET /health -``` - -### Submit Crawl Task -```http -POST /crawl +#### Getting a Token +```http +POST /token Content-Type: application/json { - "urls": "string or array", - "extraction_config": { - "type": "basic|llm|cosine|json_css", - "params": {} - }, - "priority": 1-10, - "ttl": 3600 + "email": "user@example.com" } ``` -### Get Task Status -```http -GET /task/{task_id} +The endpoint returns: +```json +{ + "email": "user@example.com", + "access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...", + "token_type": "bearer" +} ``` -For more details, visit the [official documentation](https://docs.crawl4ai.com/). \ No newline at end of file +#### Using the Token +Add the token to your requests: +```bash +curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl +``` + +Using the Python SDK: +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +async with Crawl4aiDockerClient() as client: + # Authenticate first + await client.authenticate("user@example.com") + + # Now all requests will include the token automatically + result = await client.crawl(urls=["https://example.com"]) +``` + +#### Production Considerations 💡 +The default implementation uses a simple email verification. 
For production use, consider: +- Email verification via OTP/magic links +- OAuth2 integration +- Rate limiting token generation +- Token expiration and refresh mechanisms +- IP-based restrictions + +### Configuration Tips and Best Practices + +1. **Production Settings** 🏭 + + ```yaml + app: + reload: False # Disable reload in production + timeout_keep_alive: 120 # Lower timeout for better resource management + + rate_limiting: + storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting + default_limit: "50/minute" # More conservative rate limit + + security: + enabled: true # Enable all security features + trusted_hosts: ["your-domain.com"] # Restrict to your domain + ``` + +2. **Development Settings** 🛠️ + + ```yaml + app: + reload: True # Enable hot reloading + timeout_keep_alive: 300 # Longer timeout for debugging + + logging: + level: "DEBUG" # More verbose logging + ``` + +3. **High-Traffic Settings** 🚦 + + ```yaml + crawler: + memory_threshold_percent: 85.0 # More conservative memory limit + rate_limiter: + base_delay: [2.0, 4.0] # More aggressive rate limiting + ``` + +### Customizing Your Configuration + +#### Method 1: Pre-build Configuration + +```bash +# Copy and modify config before building +cd crawl4ai/deploy +vim custom-config.yml # Or use any editor + +# Build with custom config +docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest . +``` + +#### Method 2: Build-time Configuration + +Use a custom config during build: + +```bash +# Build with custom config +docker build --platform=linux/amd64 --no-cache \ + --build-arg CONFIG_PATH=/path/to/custom-config.yml \ + -t crawl4ai:latest . +``` + +#### Method 3: Runtime Configuration +```bash +# Mount custom config at runtime +docker run -d -p 8000:8000 \ + -v $(pwd)/custom-config.yml:/app/config.yml \ + crawl4ai-server:prod +``` + +> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to the deploy directory. 
+> 💡 Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config. + +### Configuration Recommendations + +1. **Security First** 🔒 + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** 💻 + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. **Monitoring** 📊 + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚡ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- 📖 Check our [full documentation](https://docs.crawl4ai.com) +- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- 💬 Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 + +Happy crawling! 
πŸ•·οΈ \ No newline at end of file diff --git a/docs/tutorials/coming_soon.md b/docs/tutorials/coming_soon.md new file mode 100644 index 00000000..e69de29b