From 108b2a8bfbfdca6b928603596002a91b608af860 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 10 Apr 2025 23:22:38 +0800 Subject: [PATCH 1/5] Fixed capturing console messages for case the url is the local file. Update docker configuration (work in progress) --- Dockerfile | 33 +- crawl4ai/async_crawler_strategy.py | 49 +- crawl4ai/browser_manager.py | 2 +- deploy/docker/requirements.txt | 1 - deploy/docker/supervisord.conf | 24 +- docker-compose.yml | 72 +- .../network_console_capture_example.py | 20 +- docs/md_v2/core/docker-deployment.md | 1361 +++++++++-------- docs/tutorials/coming_soon.md | 0 9 files changed, 898 insertions(+), 664 deletions(-) create mode 100644 docs/tutorials/coming_soon.md diff --git a/Dockerfile b/Dockerfile index 9796bcb6..8b84f797 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,7 +24,7 @@ ARG TARGETARCH LABEL maintainer="unclecode" LABEL description="πŸ”₯πŸ•·οΈ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -LABEL version="1.0" +LABEL version="1.0" RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libjpeg-dev \ redis-server \ supervisor \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libcairo2 \ libasound2 \ libatspi2.0-0 \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ apt-get update && apt-get install -y --no-install-recommends \ nvidia-cuda-toolkit \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/* ; \ else \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ @@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \ echo "🦾 Installing ARM-specific optimizations"; \ apt-get update && apt-get install -y 
--no-install-recommends \ libopenblas-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ elif [ "$TARGETARCH" = "amd64" ]; then \ echo "πŸ–₯️ Installing AMD64-specific optimizations"; \ apt-get update && apt-get install -y --no-install-recommends \ libomp-dev \ + && apt-get clean \ && rm -rf /var/lib/apt/lists/*; \ else \ echo "Skipping platform-specific optimizations (unsupported platform)"; \ fi +# Create a non-root user and group +RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser + +# Create and set permissions for appuser home directory +RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser + WORKDIR ${APP_HOME} RUN echo '#!/bin/bash\n\ @@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh COPY . /tmp/project/ +# Copy supervisor config first (might need root later, but okay for now) COPY deploy/docker/supervisord.conf . COPY deploy/docker/requirements.txt . @@ -131,16 +143,23 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ else \ pip install "/tmp/project" ; \ fi - + RUN pip install --no-cache-dir --upgrade pip && \ /tmp/install.sh && \ python -c "import crawl4ai; print('βœ… crawl4ai is ready to rock!')" && \ python -c "from playwright.sync_api import sync_playwright; print('βœ… Playwright is feeling dramatic!')" - + RUN playwright install --with-deps chromium +# Copy application code COPY deploy/docker/* ${APP_HOME}/ +# Change ownership of the application directory to the non-root user +RUN chown -R appuser:appuser ${APP_HOME} + +# give permissions to redis persistence dirs if used +RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis + HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD bash -c '\ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ @@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ exit 1; \ fi && \ redis-cli ping > /dev/null && \ - curl -f http://localhost:8000/health || 
exit 1' + curl -f http://localhost:11235/health || exit 1' EXPOSE 6379 -CMD ["supervisord", "-c", "supervisord.conf"] - +# Switch to the non-root user before starting the application +USER appuser + +CMD ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index f99d1cb9..3278c731 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): user_agent = kwargs.get("user_agent", self.user_agent) # Use browser_manager to get a fresh page & context assigned to this session_id - page, context = await self.browser_manager.get_page(session_id, user_agent) + page, context = await self.browser_manager.get_page(CrawlerRunConfig( + session_id=session_id, + user_agent=user_agent, + **kwargs, + )) return session_id async def crawl( @@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html = f.read() if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) + if config.capture_console_messages: + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + captured_console = await self._capture_console_messages(page, url) + return AsyncCrawlResponse( html=html, response_headers=response_headers, status_code=status_code, screenshot=screenshot_data, get_delayed_content=None, + console_messages=captured_console, ) elif url.startswith("raw:") or url.startswith("raw://"): @@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "url": request.url, "method": request.method, "resource_type": request.resource_type, - "failure_text": request.failure.error_text if request.failure else "Unknown failure", + "failure_text": str(request.failure) if request.failure else "Unknown failure", "timestamp": time.time() }) except Exception as e: @@ -1274,6 +1283,42 @@ class 
AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): ) return None + async def _capture_console_messages( + self, page: Page, file_path: str + ) -> List[Dict[str, Union[str, float]]]: + """ + Captures console messages from the page. + Args: + + page (Page): The Playwright page object + Returns: + List[Dict[str, Union[str, float]]]: A list of captured console messages + """ + captured_console = [] + + def handle_console_message(msg): + try: + message_type = msg.type + message_text = msg.text + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time(), + } + captured_console.append(entry) + except Exception as e: + if self.logger: + self.logger.warning( + f"Error capturing console message: {e}", tag="CAPTURE" + ) + + page.on("console", handle_console_message) + + await page.goto(file_path) + + return captured_console + async def take_screenshot(self, page, **kwargs) -> str: """ Take a screenshot of the current page. diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 7fc819e0..f3c7d861 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -658,7 +658,7 @@ class BrowserManager: "name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url - if crawlerRunConfig + if crawlerRunConfig and crawlerRunConfig.url else "https://crawl4ai.com/", } ] diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index b7e6d8ad..40a33a79 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -1,4 +1,3 @@ -crawl4ai fastapi uvicorn gunicorn>=23.0.0 diff --git a/deploy/docker/supervisord.conf b/deploy/docker/supervisord.conf index 1274f2c3..d51cc953 100644 --- a/deploy/docker/supervisord.conf +++ b/deploy/docker/supervisord.conf @@ -1,12 +1,28 @@ [supervisord] -nodaemon=true +nodaemon=true ; Run supervisord in the foreground +logfile=/dev/null ; Log supervisord output to stdout/stderr +logfile_maxbytes=0 [program:redis] -command=redis-server 
+command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine +user=appuser ; Run redis as our non-root user autorestart=true priority=10 +stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr +stderr_logfile_maxbytes=0 [program:gunicorn] -command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app +command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app +directory=/app ; Working directory for the app +user=appuser ; Run gunicorn as our non-root user autorestart=true -priority=20 \ No newline at end of file +priority=20 +environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs +stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout +stdout_logfile_maxbytes=0 +stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr +stderr_logfile_maxbytes=0 + +# Optional: Add filebeat or other logging agents here if needed \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 6a7bf7cb..f112f9fd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,15 +1,31 @@ -# Base configuration (not a service, just a reusable config block) +# docker-compose.yml +# This file is in the root directory alongside Dockerfile + +# Base configuration anchor for reusability x-base-config: &base-config ports: + # Map host port 11235 to container port 11235 (where Gunicorn will listen) - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" + # - "8080:8080" # Uncomment if needed + + # Load API keys primarily from .llm.env file + # Create .llm.env in the 
root directory from deploy/docker/.llm.env.example + env_file: + - .llm.env + + # Define environment variables, allowing overrides from host environment + # Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-} + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} + - GROQ_API_KEY=${GROQ_API_KEY:-} + - TOGETHER_API_KEY=${TOGETHER_API_KEY:-} + - MISTRAL_API_KEY=${MISTRAL_API_KEY:-} + - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-} + volumes: + # Mount /dev/shm for Chromium/Playwright performance - /dev/shm:/dev/shm deploy: resources: @@ -19,47 +35,47 @@ x-base-config: &base-config memory: 1G restart: unless-stopped healthcheck: + # IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf test: ["CMD", "curl", "-f", "http://localhost:11235/health"] interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s # Give the server time to start + # Run the container as the non-root user defined in the Dockerfile + user: "appuser" services: - # Local build services for different platforms - crawl4ai-amd64: + # --- Local Build Services --- + crawl4ai-local-amd64: build: - context: . - dockerfile: Dockerfile + context: . # Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/amd64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile + platform: linux/amd64 profiles: ["local-amd64"] - <<: *base-config # extends yerine doğrudan yapΔ±landΔ±rmayΔ± dahil ettik + <<: *base-config # Inherit base configuration - crawl4ai-arm64: + crawl4ai-local-arm64: build: - context: . 
- dockerfile: Dockerfile + context: . # Build context is the root directory + dockerfile: Dockerfile # Dockerfile is in the root directory args: - PYTHON_VERSION: "3.10" - INSTALL_TYPE: ${INSTALL_TYPE:-basic} - ENABLE_GPU: false - platforms: - - linux/arm64 + INSTALL_TYPE: ${INSTALL_TYPE:-default} + ENABLE_GPU: ${ENABLE_GPU:-false} + platform: linux/arm64 profiles: ["local-arm64"] <<: *base-config - # Hub services for different platforms and versions + # --- Docker Hub Image Services --- crawl4ai-hub-amd64: - image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + image: unclecode/crawl4ai:${VERSION:-latest}-amd64 profiles: ["hub-amd64"] <<: *base-config crawl4ai-hub-arm64: - image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + image: unclecode/crawl4ai:${VERSION:-latest}-arm64 profiles: ["hub-arm64"] <<: *base-config \ No newline at end of file diff --git a/docs/examples/network_console_capture_example.py b/docs/examples/network_console_capture_example.py index 5305ddc3..0208bdce 100644 --- a/docs/examples/network_console_capture_example.py +++ b/docs/examples/network_console_capture_example.py @@ -357,8 +357,7 @@ async def demo_performance_analysis(): async with AsyncWebCrawler() as crawler: config = CrawlerRunConfig( capture_network_requests=True, - wait_until="networkidle", - page_timeout=60000 # 60 seconds + page_timeout=60 * 2 * 1000 # 120 seconds ) result = await crawler.arun( @@ -406,6 +405,13 @@ async def demo_performance_analysis(): "url": url, "duration_ms": duration }) + if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing: + # Convert to milliseconds + duration = (timing["responseStart"] - timing["requestStart"]) * 1000 + resource_timings[resource_type].append({ + "url": url, + "duration_ms": duration + }) # Calculate statistics for each resource type print("\nPerformance by resource type:") @@ -455,14 +461,14 @@ async def main(): os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) # Run 
basic examples - await demo_basic_network_capture() + # await demo_basic_network_capture() await demo_basic_console_capture() - await demo_combined_capture() + # await demo_combined_capture() # Run advanced examples - await analyze_spa_network_traffic() - await demo_security_analysis() - await demo_performance_analysis() + # await analyze_spa_network_traffic() + # await demo_security_analysis() + # await demo_performance_analysis() print("\n=== Examples Complete ===") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md index a3d0def1..b4b6e414 100644 --- a/docs/md_v2/core/docker-deployment.md +++ b/docs/md_v2/core/docker-deployment.md @@ -1,702 +1,833 @@ -# Docker Deployment +# Crawl4AI Docker Guide 🐳 -Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments. +## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Local Build](#local-build) + - [Docker Hub](#docker-hub) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Getting Help](#getting-help) -## Quick Start πŸš€ +## Prerequisites -Pull and run the basic version: +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher) +- At least 4GB of RAM available for the container +- Python 3.10+ (if using the Python SDK) +- Node.js 16+ (if using the Node.js examples) + +> πŸ’‘ **Pro tip**: Run `docker info` to check your Docker installation and available resources. 
+ +## Installation + +### Local Build + +Let's get your local environment set up step by step! + +#### 1. Building the Image + +First, clone the repository and build the Docker image: ```bash -# Basic run without security -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai/deploy -# Run with API security enabled -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic +# Build the Docker image +docker build --platform=linux/amd64 --no-cache -t crawl4ai . + +# Or build for arm64 +docker build --platform=linux/arm64 --no-cache -t crawl4ai . ``` -## Running with Docker Compose 🐳 +#### 2. Environment Setup -### Use Docker Compose (From Local Dockerfile or Docker Hub) +If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file: -Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. - -### **Option 1: Using Docker Compose to Build Locally** -If you want to build the image locally, use the provided `docker-compose.local.yml` file. - -```bash -docker-compose -f docker-compose.local.yml up -d -``` - -This will: -1. Build the Docker image from the provided `Dockerfile`. -2. Start the container and expose it on `http://localhost:11235`. - ---- - -### **Option 2: Using Docker Compose with Pre-Built Image from Hub** -If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. - -```bash -docker-compose -f docker-compose.hub.yml up -d -``` - -This will: -1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). -2. Start the container and expose it on `http://localhost:11235`. 
- ---- - -### **Stopping the Running Services** - -To stop the services started via Docker Compose, you can use: - -```bash -docker-compose -f docker-compose.local.yml down -# OR -docker-compose -f docker-compose.hub.yml down -``` - -If the containers don’t stop and the application is still running, check the running containers: - -```bash -docker ps -``` - -Find the `CONTAINER ID` of the running service and stop it forcefully: - -```bash -docker stop -``` - ---- - -### **Debugging with Docker Compose** - -- **Check Logs**: To view the container logs: - ```bash - docker-compose -f docker-compose.local.yml logs -f - ``` - -- **Remove Orphaned Containers**: If the service is still running unexpectedly: - ```bash - docker-compose -f docker-compose.local.yml down --remove-orphans - ``` - -- **Manually Remove Network**: If the network is still in use: - ```bash - docker network ls - docker network rm crawl4ai_default - ``` - ---- - -### Why Use Docker Compose? - -Docker Compose is the recommended way to deploy Crawl4AI because: -1. It simplifies multi-container setups. -2. Allows you to define environment variables, resources, and ports in a single file. -3. Makes it easier to switch between local development and production-ready images. - -For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent. 
- - - - -## API Security πŸ”’ - -### Understanding CRAWL4AI_API_TOKEN - -The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance: - -- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication -- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible - -```bash -# Secured Instance -docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all - -# Unsecured Instance -docker run -p 11235:11235 unclecode/crawl4ai:all -``` - -### Making API Calls - -For secured instances, include the token in all requests: - -```python -import requests - -# Setup headers if token is being used -api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN -headers = {"Authorization": f"Bearer {api_token}"} if api_token else {} - -# Making authenticated requests -response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://example.com", - "priority": 10 - } -) - -# Checking task status -task_id = response.json()["task_id"] -status = requests.get( - f"http://localhost:11235/task/{task_id}", - headers=headers -) -``` - -### Using with Docker Compose - -In your `docker-compose.yml`: -```yaml -services: - crawl4ai: - image: unclecode/crawl4ai:all - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional - # ... other configuration -``` - -Then either: -1. Set in `.env` file: ```env -CRAWL4AI_API_TOKEN=your_secret_token +# OpenAI +OPENAI_API_KEY=sk-your-key + +# Anthropic +ANTHROPIC_API_KEY=your-anthropic-key + +# DeepSeek +DEEPSEEK_API_KEY=your-deepseek-key + +# Check out https://docs.litellm.ai/docs/providers for more providers! ``` -2. Or set via command line: +> πŸ”‘ **Note**: Keep your API keys secure! Never commit them to version control. + +#### 3. 
Running the Container + +You have several options for running the container: + +Basic run (no LLM support): ```bash -CRAWL4AI_API_TOKEN=your_secret_token docker-compose up +docker run -d -p 8000:8000 --name crawl4ai crawl4ai ``` -> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`). +With LLM support: +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --name crawl4ai \ + crawl4ai +``` -## Configuration Options πŸ”§ +Using host environment variables (Not a good practice, but works for local testing): +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --env "$(env)" \ + --name crawl4ai \ + crawl4ai +``` -### Environment Variables - -You can configure the service using environment variables: +#### Multi-Platform Build +For distributing your image across different architectures, use `buildx`: ```bash -# Basic configuration -docker run -p 11235:11235 \ - -e MAX_CONCURRENT_TASKS=5 \ - unclecode/crawl4ai:all +# Set up buildx builder +docker buildx create --use -# With security and LLM support -docker run -p 11235:11235 \ - -e CRAWL4AI_API_TOKEN=your_secret_token \ - -e OPENAI_API_KEY=sk-... \ - -e ANTHROPIC_API_KEY=sk-ant-... \ - unclecode/crawl4ai:all +# Build for multiple platforms +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + -t crawl4ai \ + --push \ + . ``` -### Using Docker Compose (Recommended) 🐳 +> πŸ’‘ **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry. 
-Create a `docker-compose.yml`: +#### Development Build +For development, you might want to enable all features: -```yaml -version: '3.8' - -services: - crawl4ai: - image: unclecode/crawl4ai:all - ports: - - "11235:11235" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security - - MAX_CONCURRENT_TASKS=5 - # LLM Provider Keys - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G -``` - -You can run it in two ways: - -1. Using environment variables directly: ```bash -CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up +docker build -t crawl4ai + --build-arg INSTALL_TYPE=all \ + --build-arg PYTHON_VERSION=3.10 \ + --build-arg ENABLE_GPU=true \ + . ``` -2. Using a `.env` file (recommended): -Create a `.env` file in the same directory: -```env -# API Security (optional) -CRAWL4AI_API_TOKEN=your_secret_token +#### GPU-Enabled Build +If you plan to use GPU acceleration: -# LLM Provider Keys -OPENAI_API_KEY=sk-... -ANTHROPIC_API_KEY=sk-ant-... - -# Other Configuration -MAX_CONCURRENT_TASKS=5 -``` - -Then simply run: ```bash -docker-compose up +docker build -t crawl4ai + --build-arg ENABLE_GPU=true \ + deploy/docker/ ``` -### Testing the Deployment πŸ§ͺ +### Build Arguments Explained + +| Argument | Description | Default | Options | +|----------|-------------|---------|----------| +| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 | +| INSTALL_TYPE | Feature set | default | default, all, torch, transformer | +| ENABLE_GPU | GPU support | false | true, false | +| APP_HOME | Install path | /app | any valid path | + +### Build Best Practices + +1. **Choose the Right Install Type** + - `default`: Basic installation, smallest image, to be honest, I use this most of the time. + - `all`: Full features, larger image (include transformer, and nltk, make sure you really need them) + +2. 
**Platform Considerations** + - Let Docker auto-detect platform unless you need cross-compilation + - Use --platform for specific architecture requirements + - Consider buildx for multi-architecture distribution + +3. **Performance Optimization** + - The image automatically includes platform-specific optimizations + - AMD64 gets OpenMP optimizations + - ARM64 gets OpenBLAS optimizations + +### Docker Hub + +> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned! + +## Using the API + +In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by using the JSON structure and passing it to all the URLs, which I will explain in detail. + +### Python SDK + +The SDK makes things easier! Here's how to use it: ```python -import requests +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig -# For unsecured instances -def test_unsecured(): - # Health check - health = requests.get("http://localhost:11235/health") - print("Health check:", health.json()) - - # Basic crawl - response = requests.post( - "http://localhost:11235/crawl", - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) - -# For secured instances -def test_secured(api_token): - headers = {"Authorization": f"Bearer {api_token}"} - - # Basic crawl with authentication - response = requests.post( - "http://localhost:11235/crawl", - headers=headers, - json={ - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - ) - task_id = response.json()["task_id"] - print("Task ID:", task_id) -``` - -### LLM Extraction Example πŸ€– - -When you've configured your LLM provider keys (via environment variables or `.env`), 
you can use LLM extraction: - -```python -request = { - "urls": "https://example.com", - "extraction_config": { - "type": "llm", - "params": { - "provider": "openai/gpt-4", - "instruction": "Extract main topics from the page" - } - } -} - -# Make the request (add headers if using API security) -response = requests.post("http://localhost:11235/crawl", json=request) -``` - -> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure! - - -## Usage Examples πŸ“ - -### Basic Crawling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 -} - -response = requests.post("http://localhost:11235/crawl", json=request) -task_id = response.json()["task_id"] - -# Get results -result = requests.get(f"http://localhost:11235/task/{task_id}") -``` - -### Structured Data Extraction - -```python -schema = { - "name": "Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", - } - ], -} - -request = { - "urls": "https://www.coinbase.com/explore", - "extraction_config": { - "type": "json_css", - "params": {"schema": schema} - } -} -``` - -### Dynamic Content Handling - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "js_code": [ - "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();" - ], - "wait_for": "article.tease-card:nth-child(10)" -} -``` - -### AI-Powered Extraction (Full Version) - -```python -request = { - "urls": "https://www.nbcnews.com/business", - "extraction_config": { - "type": "cosine", - "params": { - "semantic_filter": "business finance economy", - "word_count_threshold": 10, - "max_dist": 0.2, - "top_k": 3 - } - } -} -``` - -## Platform-Specific Instructions πŸ’» - -### macOS -```bash -docker pull 
unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -### Ubuntu -```bash -# Basic version -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic - -# With GPU support -docker pull unclecode/crawl4ai:gpu -docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu -``` - -### Windows (PowerShell) -```powershell -docker pull unclecode/crawl4ai:basic -docker run -p 11235:11235 unclecode/crawl4ai:basic -``` - -## Testing πŸ§ͺ - -Save this as `test_docker.py`: - -```python -import requests -import json -import time -import sys - -class Crawl4AiTester: - def __init__(self, base_url: str = "http://localhost:11235"): - self.base_url = base_url +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: + # If JWT is enabled, you can authenticate like this: (more on this later) + # await client.authenticate("test@example.com") - def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict: - # Submit crawl job - response = requests.post(f"{self.base_url}/crawl", json=request_data) - task_id = response.json()["task_id"] - print(f"Task ID: {task_id}") + # Non-streaming crawl + results = await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig() + ) + print(f"Non-streaming results: {results}") - # Poll for result - start_time = time.time() - while True: - if time.time() - start_time > timeout: - raise TimeoutError(f"Task {task_id} timeout") - - result = requests.get(f"{self.base_url}/task/{task_id}") - status = result.json() - - if status["status"] == "completed": - return status - - time.sleep(2) - -def test_deployment(): - tester = Crawl4AiTester() - - # Test basic crawl - request = { - "urls": "https://www.nbcnews.com/business", - "priority": 10 - } - - result = tester.submit_and_wait(request) - print("Basic crawl successful!") - print(f"Content length: 
{len(result['result']['markdown'])}") + # Streaming crawl + crawler_config = CrawlerRunConfig(stream=True) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=crawler_config + ): + print(f"Streamed result: {result}") + + # Get schema + schema = await client.get_schema() + print(f"Schema: {schema}") if __name__ == "__main__": - test_deployment() + asyncio.run(main()) ``` -## Advanced Configuration βš™οΈ +`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control: -### Crawler Parameters +- `base_url` (str): Base URL of the Crawl4AI Docker server +- `timeout` (float): Default timeout for requests in seconds +- `verify_ssl` (bool): Whether to verify SSL certificates +- `verbose` (bool): Whether to show logging output +- `log_file` (str, optional): Path to log file if file logging is desired -The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use: +This client SDK generates a properly structured JSON request for the server's HTTP API. +## Second Approach: Direct API Calls + +This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works. + +### Understanding Configuration Structure + +Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity. 
+ +#### The Basic Pattern + +Try this in Python to understand the structure: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - # Browser Configuration - "headless": True, # Run in headless mode - "browser_type": "chromium", # chromium/firefox/webkit - "user_agent": "custom-agent", # Custom user agent - "proxy": "http://proxy:8080", # Proxy configuration - - # Performance & Behavior - "page_timeout": 30000, # Page load timeout (ms) - "verbose": True, # Enable detailed logging - "semaphore_count": 5, # Concurrent request limit - - # Anti-Detection Features - "simulate_user": True, # Simulate human behavior - "magic": True, # Advanced anti-detection - "override_navigator": True, # Override navigator properties - - # Session Management - "user_data_dir": "./browser-data", # Browser profile location - "use_managed_browser": True, # Use persistent browser +from crawl4ai import BrowserConfig + +# Create a config and see its structure +config = BrowserConfig(headless=True) +print(config.dump()) +``` + +This outputs: +```json +{ + "type": "BrowserConfig", + "params": { + "headless": true } } ``` -### Extra Parameters +#### Simple vs Complex Values -The `extra` field allows passing additional parameters directly to the crawler's `arun` function: +The structure follows these rules: +- Simple values (strings, numbers, booleans, lists) are passed directly +- Complex values (classes, dictionaries) use the type-params pattern -```python -request = { - "urls": "https://example.com", - "extra": { - "word_count_threshold": 10, # Min words per block - "only_text": True, # Extract only text - "bypass_cache": True, # Force fresh crawl - "process_iframes": True, # Include iframe content - } -} -``` - -### Complete Examples - -1.β€€**Advanced News Crawling** -```python -request = { - "urls": "https://www.nbcnews.com/business", - "crawler_params": { - "headless": True, - "page_timeout": 30000, - "remove_overlay_elements": True # Remove popups - }, - "extra": 
{ - "word_count_threshold": 50, # Longer content blocks - "bypass_cache": True # Fresh content - }, - "css_selector": ".article-body" -} -``` - -2.β€€**Anti-Detection Configuration** -```python -request = { - "urls": "https://example.com", - "crawler_params": { - "simulate_user": True, - "magic": True, - "override_navigator": True, - "user_agent": "Mozilla/5.0 ...", - "headers": { - "Accept-Language": "en-US,en;q=0.9" - } - } -} -``` - -3.β€€**LLM Extraction with Custom Parameters** -```python -request = { - "urls": "https://openai.com/pricing", - "extraction_config": { - "type": "llm", +For example, with dictionaries: +```json +{ + "browser_config": { + "type": "BrowserConfig", "params": { - "provider": "openai/gpt-4", - "schema": pricing_schema + "headless": true, // Simple boolean - direct value + "viewport": { // Complex dictionary - needs type-params + "type": "dict", + "value": { + "width": 1200, + "height": 800 + } + } } - }, - "crawler_params": { - "verbose": True, - "page_timeout": 60000 - }, - "extra": { - "word_count_threshold": 1, - "only_text": True } } ``` -4.β€€**Session-Based Dynamic Content** +#### Strategy Pattern and Nesting + +Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "chunking_strategy": { + "type": "RegexChunking", // Strategy implementation + "params": { + "patterns": ["\n\n", "\\.\\s+"] + } + } + } + } +} +``` + +Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy. 
+ +#### Complex Nested Example + +Let's look at a more complex example with content filtering: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} +``` + +This shows how deeply configurations can nest while maintaining a consistent structure. + +#### Quick Grammar Overview +``` +config := { + "type": string, + "params": { + key: simple_value | complex_value + } +} + +simple_value := string | number | boolean | [simple_value] +complex_value := config | dict_value + +dict_value := { + "type": "dict", + "value": object +} +``` + +#### Important Rules 🚨 + +- Always use the type-params pattern for class instances +- Use direct values for primitives (numbers, strings, booleans) +- Wrap dictionaries with {"type": "dict", "value": {...}} +- Arrays/lists are passed directly without type-params +- All parameters are optional unless specifically required + +#### Pro Tip πŸ’‘ + +The easiest way to get the correct structure is to: +1. Create configuration objects in Python +2. Use the `dump()` method to see their JSON representation +3. 
Use that JSON in your API calls + +Example: ```python -request = { - "urls": "https://example.com", - "crawler_params": { - "session_id": "dynamic_session", - "headless": False, - "page_timeout": 60000 - }, - "js_code": ["window.scrollTo(0, document.body.scrollHeight);"], - "wait_for": "js:() => document.querySelectorAll('.item').length > 10", - "extra": { - "delay_before_return_html": 2.0 +from crawl4ai import CrawlerRunConfig, PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed") + ), + cache_mode= CacheMode.BYPASS +) +print(config.dump()) # Use this JSON in your API calls +``` + + +#### More Examples + +**Advanced Crawler Configuration** + +```json +{ + "urls": ["https://example.com"], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed", + "min_word_threshold": 0 + } + } + } + } + } } } ``` -5.β€€**Screenshot with Custom Timing** +**Extraction Strategy**: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract article title, author, publication date and main content", + "provider": "openai/gpt-4", + "api_token": "your-api-token", + "schema": { + "type": "dict", + "value": { + "title": 
"Article Schema", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The article's headline" + }, + "author": { + "type": "string", + "description": "The author's name" + }, + "published_date": { + "type": "string", + "format": "date-time", + "description": "Publication date and time" + }, + "content": { + "type": "string", + "description": "The main article content" + } + }, + "required": ["title", "content"] + } + } + } + } + } + } +} +``` + +**Deep Crawler Example** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 3, + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": { + "allowed_types": ["text/html", "application/xhtml+xml"] + } + }, + { + "type": "DomainFilter", + "params": { + "allowed_domains": ["blog.*", "docs.*"], + } + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": { + "keywords": ["tutorial", "guide", "documentation"], + } + }, + { + "type": "PathDepthScorer", + "params": { + "weight": 0.5, + "optimal_depth": 3 + } + } + ] + } + } + } + } + } + } +} +``` + +### REST API Examples + +Let's look at some practical examples: + +#### Simple Crawl + ```python -request = { - "urls": "https://example.com", - "screenshot": True, - "crawler_params": { - "headless": True, - "screenshot_wait_for": ".main-content" - }, - "extra": { - "delay_before_return_html": 3.0 - } +import requests + +crawl_payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True}, + "crawler_config": {"stream": False} } +response = requests.post( + "http://localhost:8000/crawl", + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later + json=crawl_payload +) +print(response.json()) # Print the response for debugging 
``` -### Parameter Reference Table +#### Streaming Results -| Category | Parameter | Type | Description | -|----------|-----------|------|-------------| -| Browser | headless | bool | Run browser in headless mode | -| Browser | browser_type | str | Browser engine selection | -| Browser | user_agent | str | Custom user agent string | -| Network | proxy | str | Proxy server URL | -| Network | headers | dict | Custom HTTP headers | -| Timing | page_timeout | int | Page load timeout (ms) | -| Timing | delay_before_return_html | float | Wait before capture | -| Anti-Detection | simulate_user | bool | Human behavior simulation | -| Anti-Detection | magic | bool | Advanced protection | -| Session | session_id | str | Browser session ID | -| Session | user_data_dir | str | Profile directory | -| Content | word_count_threshold | int | Minimum words per block | -| Content | only_text | bool | Text-only extraction | -| Content | process_iframes | bool | Include iframe content | -| Debug | verbose | bool | Detailed logging | -| Debug | log_console | bool | Browser console logs | +```python +async def test_stream_crawl(session, token: str): + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:8000/crawl/stream" + payload = { + "urls": [ + "https://example.com", + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ], + "browser_config": {"headless": True, "viewport": {"width": 1200}}, + "crawler_config": {"stream": True, "cache_mode": "bypass"} + } -## Troubleshooting πŸ” + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later + + try: + async with session.post(url, json=payload, headers=headers) as response: + status = response.status + print(f"Status: {status} (Expected: 200)") + assert status == 200, f"Expected 200, got {status}" + + # Read streaming response line-by-line (NDJSON) + async for line in response.content: + if line: + data = 
json.loads(line.decode('utf-8').strip()) + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") +``` -### Common Issues +## Metrics & Monitoring -1.β€€**Connection Refused** - ``` - Error: Connection refused at localhost:11235 - ``` - Solution: Ensure the container is running and ports are properly mapped. +Keep an eye on your crawler with these endpoints: -2.β€€**Resource Limits** - ``` - Error: No available slots - ``` - Solution: Increase MAX_CONCURRENT_TASKS or container resources. +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema -3.β€€**GPU Access** - ``` - Error: GPU not found - ``` - Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag. - -### Debug Mode - -Access container for debugging: +Example health check: ```bash -docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all +curl http://localhost:8000/health ``` -View container logs: -```bash -docker logs [container_id] +## Deployment Scenarios + +> 🚧 Coming soon! We'll cover: +> - Kubernetes deployment +> - Cloud provider setups (AWS, GCP, Azure) +> - High-availability configurations +> - Load balancing strategies + +## Complete Examples + +Check out the `examples` folder in our repository for full working examples! Here are two to get you started: +[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) +[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security. + +### Understanding config.yml + +The configuration file is located at `deploy/docker/config.yml`. 
You can either modify this file before building the image or mount a custom configuration when running the container. + +Here's a detailed breakdown of the configuration options: + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" # Server title in OpenAPI docs + version: "1.0.0" # API version + host: "0.0.0.0" # Listen on all interfaces + port: 8000 # Server port + reload: True # Enable hot reloading (development only) + timeout_keep_alive: 300 # Keep-alive timeout in seconds + +# Rate Limiting Configuration +rate_limiting: + enabled: True # Enable/disable rate limiting + default_limit: "100/minute" # Rate limit format: "number/timeunit" + trusted_proxies: [] # List of trusted proxy IPs + storage_uri: "memory://" # Use "redis://localhost:6379" for production + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: true # Enable JWT authentication + https_redirect: True # Force HTTPS + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 # Memory usage threshold + rate_limiter: + base_delay: [1.0, 2.0] # Min and max delay between requests + timeouts: + stream_init: 30.0 # Stream initialization timeout + batch_process: 300.0 # Batch processing timeout + +# Logging Configuration +logging: + level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR) + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True # Enable Prometheus metrics + endpoint: "/metrics" # Metrics endpoint + health_check: + endpoint: "/health" # Health check endpoint ``` -## Best Practices 🌟 +### JWT Authentication -1.β€€**Resource Management** - - Set 
appropriate memory and CPU limits - - Monitor resource usage via health endpoint - - Use basic version for simple crawling tasks +When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works: -2.β€€**Scaling** - - Use multiple containers for high load - - Implement proper load balancing - - Monitor performance metrics - -3.β€€**Security** - - Use environment variables for sensitive data - - Implement proper network isolation - - Regular security updates - -## API Reference πŸ“š - -### Health Check -```http -GET /health -``` - -### Submit Crawl Task -```http -POST /crawl +#### Getting a Token +```python +POST /token Content-Type: application/json { - "urls": "string or array", - "extraction_config": { - "type": "basic|llm|cosine|json_css", - "params": {} - }, - "priority": 1-10, - "ttl": 3600 + "email": "user@example.com" } ``` -### Get Task Status -```http -GET /task/{task_id} +The endpoint returns: +```json +{ + "email": "user@example.com", + "access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...", + "token_type": "bearer" +} ``` -For more details, visit the [official documentation](https://docs.crawl4ai.com/). \ No newline at end of file +#### Using the Token +Add the token to your requests: +```bash +curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl +``` + +Using the Python SDK: +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +async with Crawl4aiDockerClient() as client: + # Authenticate first + await client.authenticate("user@example.com") + + # Now all requests will include the token automatically + result = await client.crawl(urls=["https://example.com"]) +``` + +#### Production Considerations πŸ’‘ +The default implementation uses a simple email verification. 
For production use, consider: +- Email verification via OTP/magic links +- OAuth2 integration +- Rate limiting token generation +- Token expiration and refresh mechanisms +- IP-based restrictions + +### Configuration Tips and Best Practices + +1. **Production Settings** 🏭 + + ```yaml + app: + reload: False # Disable reload in production + timeout_keep_alive: 120 # Lower timeout for better resource management + + rate_limiting: + storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting + default_limit: "50/minute" # More conservative rate limit + + security: + enabled: true # Enable all security features + trusted_hosts: ["your-domain.com"] # Restrict to your domain + ``` + +2. **Development Settings** πŸ› οΈ + + ```yaml + app: + reload: True # Enable hot reloading + timeout_keep_alive: 300 # Longer timeout for debugging + + logging: + level: "DEBUG" # More verbose logging + ``` + +3. **High-Traffic Settings** 🚦 + + ```yaml + crawler: + memory_threshold_percent: 85.0 # More conservative memory limit + rate_limiter: + base_delay: [2.0, 4.0] # More aggressive rate limiting + ``` + +### Customizing Your Configuration + +#### Method 1: Pre-build Configuration + +```bash +# Copy and modify config before building +cd crawl4ai/deploy +vim custom-config.yml # Or use any editor + +# Build with custom config +docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest . +``` + +#### Method 2: Build-time Configuration + +Use a custom config during build: + +```bash +# Build with custom config +docker build --platform=linux/amd64 --no-cache \ + --build-arg CONFIG_PATH=/path/to/custom-config.yml \ + -t crawl4ai:latest . +``` + +#### Method 3: Runtime Configuration +```bash +# Mount custom config at runtime +docker run -d -p 8000:8000 \ + -v $(pwd)/custom-config.yml:/app/config.yml \ + crawl4ai-server:prod +``` + +> πŸ’‘ Note: When using Method 2, `/path/to/custom-config.yml` is relative to deploy directory. 
+> πŸ’‘ Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config. + +### Configuration Recommendations + +1. **Security First** πŸ”’ + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** πŸ’» + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. **Monitoring** πŸ“Š + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚑ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- πŸ“– Check our [full documentation](https://docs.crawl4ai.com) +- πŸ› Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- πŸ’¬ Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. πŸš€ + +Happy crawling! 
πŸ•·οΈ \ No newline at end of file diff --git a/docs/tutorials/coming_soon.md b/docs/tutorials/coming_soon.md new file mode 100644 index 00000000..e69de29b From 3179d6ad0c03e40080ba1ec8274f4690019a39bb Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 11 Apr 2025 20:58:39 +0800 Subject: [PATCH 2/5] fix(core): improve error handling and stability in core components Enhance error handling and stability across multiple components: - Add safety checks in async_configs.py for type and params existence - Fix browser manager initialization and cleanup logic - Add default LLM config fallback in extraction strategy - Add comprehensive Docker deployment guide and server tests BREAKING CHANGE: BrowserManager.start() now automatically closes existing instances --- crawl4ai/async_configs.py | 22 +- crawl4ai/browser_manager.py | 8 +- crawl4ai/extraction_strategy.py | 9 +- deploy/docker/README-new.md | 644 ++++++++++++++++++++++++++ deploy/docker/api.py | 28 +- deploy/docker/config.yml | 2 +- tests/docker/test_server_requests.py | 650 +++++++++++++++++++++++++++ 7 files changed, 1336 insertions(+), 27 deletions(-) create mode 100644 deploy/docker/README-new.md create mode 100644 tests/docker/test_server_requests.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index af98e607..2f421178 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -122,23 +122,25 @@ def from_serializable_dict(data: Any) -> Any: # Handle typed data if isinstance(data, dict) and "type" in data: # Handle plain dictionaries - if data["type"] == "dict": + if data["type"] == "dict" and "value" in data: return {k: from_serializable_dict(v) for k, v in data["value"].items()} # Import from crawl4ai for class instances import crawl4ai - cls = getattr(crawl4ai, data["type"]) + if hasattr(crawl4ai, data["type"]): + cls = getattr(crawl4ai, data["type"]) - # Handle Enum - if issubclass(cls, Enum): - return cls(data["params"]) + # Handle Enum + if issubclass(cls, Enum): + 
return cls(data["params"]) - # Handle class instances - constructor_args = { - k: from_serializable_dict(v) for k, v in data["params"].items() - } - return cls(**constructor_args) + if "params" in data: + # Handle class instances + constructor_args = { + k: from_serializable_dict(v) for k, v in data["params"].items() + } + return cls(**constructor_args) # Handle lists if isinstance(data, list): diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f3c7d861..bfe22f4e 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -491,10 +491,12 @@ class BrowserManager: Note: This method should be called in a separate task to avoid blocking the main event loop. """ - if self.playwright is None: - from playwright.async_api import async_playwright + if self.playwright is not None: + await self.close() + + from playwright.async_api import async_playwright - self.playwright = await async_playwright().start() + self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index bf4825cc..954fe37e 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -7,7 +7,9 @@ import time from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( - DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, ) @@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy): """ super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config + if not self.llm_config: + self.llm_config = create_llm_config( + provider=DEFAULT_PROVIDER, + api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY), + ) self.instruction 
= instruction self.extract_type = extraction_type self.schema = schema diff --git a/deploy/docker/README-new.md b/deploy/docker/README-new.md new file mode 100644 index 00000000..3a9bdf52 --- /dev/null +++ b/deploy/docker/README-new.md @@ -0,0 +1,644 @@ +# Crawl4AI Docker Guide 🐳 + +## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended) + - [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run) + - [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Server Configuration](#server-configuration) + - [Understanding config.yml](#understanding-configyml) + - [JWT Authentication](#jwt-authentication) + - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices) + - [Customizing Your Configuration](#customizing-your-configuration) + - [Configuration Recommendations](#configuration-recommendations) +- [Getting Help](#getting-help) + +## Prerequisites + +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop). +- `git` for cloning the repository. +- At least 4GB of RAM available for the container (more recommended for heavy use). +- Python 3.10+ (if using the Python SDK). +- Node.js 16+ (if using the Node.js examples). + +> πŸ’‘ **Pro tip**: Run `docker info` to check your Docker installation and available resources. + +## Installation + +We offer several ways to get the Crawl4AI server running. 
Docker Compose is the easiest way to manage local builds and runs. + +### Option 1: Using Docker Compose (Recommended) + +Docker Compose simplifies building and running the service, especially for local development and testing across different platforms. + +#### 1. Clone Repository + +```bash +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +#### 2. Environment Setup (API Keys) + +If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**. + +```bash +# Make sure you are in the 'crawl4ai' root directory +cp deploy/docker/.llm.env.example .llm.env + +# Now edit .llm.env and add your API keys +# Example content: +# OPENAI_API_KEY=sk-your-key +# ANTHROPIC_API_KEY=your-anthropic-key +# ... +``` +> πŸ”‘ **Note**: Keep your API keys secure! Never commit `.llm.env` to version control. + +#### 3. Build and Run with Compose + +The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**. + +* **Build and Run Locally (AMD64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-amd64 up --build -d + ``` + +* **Build and Run Locally (ARM64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-arm64 up --build -d + ``` + +* **Run Pre-built Image from Docker Hub (AMD64):** + ```bash + # Pulls and runs the specified AMD64 image from Docker Hub + # (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1) + docker compose --profile hub-amd64 up -d + ``` + +* **Run Pre-built Image from Docker Hub (ARM64):** + ```bash + # Pulls and runs the specified ARM64 image from Docker Hub + docker compose --profile hub-arm64 up -d + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. 
Stopping Compose Services + +```bash +# Stop the service(s) associated with a profile (e.g., local-amd64) +docker compose --profile local-amd64 down +``` + +### Option 2: Manual Local Build & Run + +If you prefer not to use Docker Compose for local builds. + +#### 1. Clone Repository & Setup Environment + +Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root). + +#### 2. Build the Image (Multi-Arch) + +Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon. + +```bash +# Make sure you are in the 'crawl4ai' root directory +docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load . +``` + +#### 3. Run the Container + +* **Basic run (no LLM support):** + ```bash + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory (project root) + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --env-file .llm.env \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Manual Container + +```bash +docker stop crawl4ai-standalone && docker rm crawl4ai-standalone +``` + +### Option 3: Using Pre-built Docker Hub Images + +Pull and run images directly from Docker Hub without building locally. + +#### 1. Pull the Image + +We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically. 
+ +```bash +# Pull a specific version (recommended for stability) +docker pull unclecode/crawl4ai:0.5.1-d1 + +# Or pull the latest stable version +docker pull unclecode/crawl4ai:latest +``` + +#### 2. Setup Environment (API Keys) + +If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section. + +#### 3. Run the Container + +* **Basic run:** + ```bash + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory you are running docker from + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Hub Container + +```bash +docker stop crawl4ai-hub && docker rm crawl4ai-hub +``` + +#### Docker Hub Versioning Explained + +* **Image Name:** `unclecode/crawl4ai` +* **Tag Format:** `LIBRARY_VERSION-dREVISION` + * `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`). + * `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`. +* **Example:** `unclecode/crawl4ai:0.5.1-d1` +* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`. +* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture. + +--- + +*(Rest of the document remains largely the same, but with key updates below)* + +--- + +## Dockerfile Parameters + +You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file. 
+ +```bash +# Example: Build with 'all' features using buildx +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --build-arg INSTALL_TYPE=all \ + -t yourname/crawl4ai-all:latest \ + --load \ + . # Build from root context +``` + +### Build Arguments Explained + +| Argument | Description | Default | Options | +| :----------- | :--------------------------------------- | :-------- | :--------------------------------- | +| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` | +| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` | +| APP_HOME | Install path inside container (advanced) | `/app` | any valid path | +| USE_LOCAL | Install library from local source | `true` | `true`, `false` | +| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL | +| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name | + +*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)* + +### Build Best Practices + +1. **Choose the Right Install Type** + * `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation. + * `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras. +2. **Platform Considerations** + * Use `buildx` for building multi-architecture images, especially for pushing to registries. + * Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds. +3. **Performance Optimization** + * The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64). + +--- + +## Using the API + +Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests. 
+ +### Python SDK + +Install the SDK: `pip install crawl4ai` + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed + +async def main(): + # Point to the correct server port + async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client: + # If JWT is enabled on the server, authenticate first: + # await client.authenticate("user@example.com") # See Server Configuration section + + # Example Non-streaming crawl + print("--- Running Non-Streaming Crawl ---") + results = await client.crawl( + ["https://httpbin.org/html"], + browser_config=BrowserConfig(headless=True), # Use library classes for config aid + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + if results: # client.crawl returns None on failure + print(f"Non-streaming results success: {results.success}") + if results.success: + for result in results: # Iterate through the CrawlResultContainer + print(f"URL: {result.url}, Success: {result.success}") + else: + print("Non-streaming crawl failed.") + + + # Example Streaming crawl + print("\n--- Running Streaming Crawl ---") + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + try: + async for result in await client.crawl( # client.crawl returns an async generator for streaming + ["https://httpbin.org/html", "https://httpbin.org/links/5/0"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed result: URL: {result.url}, Success: {result.success}") + except Exception as e: + print(f"Streaming crawl failed: {e}") + + + # Example Get schema + print("\n--- Getting Schema ---") + schema = await client.get_schema() + print(f"Schema received: {bool(schema)}") # Print whether schema was received + +if __name__ == "__main__": + asyncio.run(main()) +``` + +*(SDK parameters like timeout, verify_ssl etc. 
remain the same)* + +### Second Approach: Direct API Calls + +Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`. + +*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)* + +#### More Examples *(Ensure Schema example uses type/value wrapper)* + +**Advanced Crawler Configuration** +*(Keep example, ensure cache_mode uses valid enum value like "bypass")* + +**Extraction Strategy** +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", + "value": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)* +*(Keep Deep Crawler Example)* + +### REST API Examples + +Update URLs to use port `11235`. 
+ +#### Simple Crawl + +```python +import requests + +# Configuration objects converted to the required JSON structure +browser_config_payload = { + "type": "BrowserConfig", + "params": {"headless": True} +} +crawler_config_payload = { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum +} + +crawl_payload = { + "urls": ["https://httpbin.org/html"], + "browser_config": browser_config_payload, + "crawler_config": crawler_config_payload +} +response = requests.post( + "http://localhost:11235/crawl", # Updated port + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled + json=crawl_payload +) +print(f"Status Code: {response.status_code}") +if response.ok: + print(response.json()) +else: + print(f"Error: {response.text}") + +``` + +#### Streaming Results + +```python +import json +import httpx # Use httpx for async streaming example + +async def test_stream_crawl(token: str = None): # Made token optional + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:11235/crawl/stream" # Updated port + payload = { + "urls": [ + "https://httpbin.org/html", + "https://httpbin.org/links/5/0", + ], + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": "bypass"} + } + } + + headers = {} + # if token: + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled + + try: + async with httpx.AsyncClient() as client: + async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response: + print(f"Status: {response.status_code} (Expected: 200)") + response.raise_for_status() # Raise exception for bad status codes + + # Read streaming response line-by-line (NDJSON) + async for line in response.aiter_lines(): + if line: + try: + 
data = json.loads(line) + # Check for completion marker + if data.get("status") == "completed": + print("Stream completed.") + break + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON line: {line}") + + except httpx.HTTPStatusError as e: + print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") + +# To run this example: +# import asyncio +# asyncio.run(test_stream_crawl()) +``` + +--- + +## Metrics & Monitoring + +Keep an eye on your crawler with these endpoints: + +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema + +Example health check: +```bash +curl http://localhost:11235/health +``` + +--- + +*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)* + +--- + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. + +### Understanding config.yml + +The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build. + +Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`): + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" + version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1" + host: "0.0.0.0" + port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf). + reload: False # Default set to False - suitable for production + timeout_keep_alive: 300 + +# Default LLM Configuration +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + # api_key: sk-... 
# If you pass the API key directly, api_key_env will be ignored
Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image. + +#### Method 2: Runtime Mount (Recommended for Custom Deploys) + +1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections. +2. Mount it when running the container: + + * **Using `docker run`:** + ```bash + # Assumes my-custom-config.yml is in the current directory + docker run -d -p 11235:11235 \ + --name crawl4ai-custom-config \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest # Or your specific tag + ``` + + * **Using `docker-compose.yml`:** Add a `volumes` section to the service definition: + ```yaml + services: + crawl4ai-hub-amd64: # Or your chosen service + image: unclecode/crawl4ai:latest + profiles: ["hub-amd64"] + <<: *base-config + volumes: + # Mount local custom config over the default one in the container + - ./my-custom-config.yml:/app/config.yml + # Keep the shared memory volume from base-config + - /dev/shm:/dev/shm + ``` + *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)* + +> πŸ’‘ When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration. + +### Configuration Recommendations + +1. **Security First** πŸ”’ + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** πŸ’» + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. 
**Monitoring** πŸ“Š + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚑ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- πŸ“– Check our [full documentation](https://docs.crawl4ai.com) +- πŸ› Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- πŸ’¬ Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. πŸš€ + +Happy crawling! 
πŸ•·οΈ diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 33802772..c01696b2 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -388,21 +388,25 @@ async def handle_crawl_request( ) ) - async with AsyncWebCrawler(config=browser_config) as crawler: - results = [] - func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, - urls[0] if len(urls) == 1 else urls, - config=crawler_config, - dispatcher=dispatcher) - results = await partial_func() - return { - "success": True, - "results": [result.model_dump() for result in results] - } + crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + results = [] + func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) + results = await partial_func() + await crawler.close() + return { + "success": True, + "results": [result.model_dump() for result in results] + } except Exception as e: logger.error(f"Crawl error: {str(e)}", exc_info=True) + if 'crawler' in locals(): + await crawler.close() raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index b7ef4885..3b5fead6 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -4,7 +4,7 @@ app: version: "1.0.0" host: "0.0.0.0" port: 8020 - reload: True + reload: False timeout_keep_alive: 300 # Default LLM Configuration diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py new file mode 100644 index 00000000..ab8b8ced --- /dev/null +++ b/tests/docker/test_server_requests.py @@ -0,0 +1,650 @@ +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +# Optional: Import crawl4ai classes directly for reference/easier payload 
creation
Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any]): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result + # Add more common checks if needed + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + else: + results.append(data) + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." + return results + + +# --- Test Class --- + +@pytest.mark.asyncio +class TestCrawlEndpoints: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + # 1. 
Simple Requests (Primitives) + async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient): + """Test /crawl with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, # Explicitly false for /crawl + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value # Use enum value + } + } + } + try: + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error: {e}") + print(f"Response content: {e.response.text}") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "

Herman Melville - Moby-Dick

" in result["html"] + # We don't specify a markdown generator in this test, so don't make assumptions about markdown field + # It might be null, missing, or populated depending on the server's default behavior + + async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, # Must be true for /crawl/stream + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value + } + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == 1 + result = results[0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "

Herman Melville - Moby-Dick

" in result["html"] + + + # 2. Multi-URL and Dispatcher + async def test_multi_url_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with multiple URLs, implicitly testing dispatcher.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value} + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == len(urls) + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + + async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with multiple URLs.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value} + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as 
response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == len(urls) + processed_urls = set() + for result in results: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + processed_urls.add(result["url"]) + assert processed_urls == set(urls) # Ensure all URLs were processed + + + # 3. Class Values and Nested Classes (Markdown Generator) + async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using PruningContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.ENABLED.value, # Test different cache mode + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.5, # Example param + "threshold_type": "relative" + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert 
result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown + assert "Moby-Dick" in result["markdown"]["raw_markdown"] + # Fit markdown content might be different/shorter due to pruning + assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"]) + + async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using BM25ContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "BM25ContentFilter", + "params": { + "user_query": "Herman Melville", # Query for BM25 + "bm25_threshold": 0.1, # Lower threshold to increase matches + "language": "english" # Valid parameters + } + } + } + } + } + } + } + try: + print(f"Payload for BM25 test: {json.dumps(payload)}") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert 
result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown + + # Print values for debug + print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}") + print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}") + + # Either fit_markdown has content (possibly including our query terms) + # or it might be empty if no good BM25 matches were found + # Don't assert specific content since it can be environment-dependent + + + # 4. Deep Crawling + async def test_deep_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with a deep crawl strategy.""" + payload = { + "urls": [DEEP_CRAWL_URL], # Start URL + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": CacheMode.BYPASS.value, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 1, # Limit depth for testing speed + "max_pages": 5, # Limit pages to crawl + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": {"allowed_types": ["text/html"]} + }, + { + "type": "DomainFilter", + "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": {"keywords": ["documentation", "tutorial"]} + }, + { + "type": "PathDepthScorer", + "params": {"weight": 0.5, "optimal_depth": 2} + } + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = 
response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + # Expect more than 1 result due to deep crawl (start URL + crawled links) + assert len(data["results"]) > 1 + assert len(data["results"]) <= 6 # Start URL + max_links=5 + + start_url_found = False + crawled_urls_found = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + + # Print URL for debugging + print(f"Crawled URL: {result['url']}") + + # Allow URLs that contain python.org (including subdomains like docs.python.org) + assert "python.org" in result["url"] + if result["url"] == DEEP_CRAWL_URL: + start_url_found = True + else: + crawled_urls_found = True + + assert start_url_found + assert crawled_urls_found + + + # 5. 
Extraction without LLM (JSON/CSS) + async def test_json_css_extraction(self, async_client: httpx.AsyncClient): + """Test /crawl with JsonCssExtractionStrategy.""" + payload = { + "urls": [SCRAPE_TARGET_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure + "value": { + "name": "BookList", + "baseSelector": "ol.row li.col-xs-6", # Select each book item + "fields": [ + {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"}, + {"name": "price", "selector": "article.product_pod .price_color", "type": "text"}, + {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"} + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # 
Extracted content should be a JSON string representing a list of dicts + try: + extracted_data = json.loads(result["extracted_content"]) + assert isinstance(extracted_data, list) + assert len(extracted_data) > 0 # Should find some books + # Check structure of the first extracted item + first_item = extracted_data[0] + assert "title" in first_item + assert "price" in first_item + assert "rating" in first_item + assert "star-rating" in first_item["rating"] # e.g., "star-rating Three" + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + + + # 6. Extraction with LLM + async def test_llm_extraction(self, async_client: httpx.AsyncClient): + """ + Test /crawl with LLMExtractionStrategy. + NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY) + configured via .llm.env or environment variables. + This test uses the default provider configured in the server's config.yml. 
+ """ + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract the main title and the author mentioned in the text into JSON.", + # LLMConfig is implicitly defined by server's config.yml and .llm.env + # If you needed to override provider/token PER REQUEST: + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o", # Example override + "api_token": os.getenv("OPENAI_API_KEY") # Example override + } + }, + "schema": { # Optional: Provide a schema for structured output + "type": "dict", # IMPORTANT: Wrap schema dict + "value": { + "title": "Book Info", + "type": "object", + "properties": { + "title": {"type": "string", "description": "The main title of the work"}, + "author": {"type": "string", "description": "The author of the work"} + }, + "required": ["title", "author"] + } + } + } + } + } + } + } + + try: + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key) + data = response.json() + except httpx.HTTPStatusError as e: + # Catch potential server errors (like 500 due to missing/invalid API keys) + pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. 
Check server logs and ensure API keys are correctly configured for the server.") + except httpx.RequestError as e: + pytest.fail(f"LLM extraction request failed: {e}.") + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # Extracted content should be JSON (because we provided a schema) + try: + extracted_data = json.loads(result["extracted_content"]) + print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification + + # Handle both dict and list formats (server returns a list) + if isinstance(extracted_data, list): + assert len(extracted_data) > 0 + extracted_item = extracted_data[0] # Take first item + assert isinstance(extracted_item, dict) + assert "title" in extracted_item + assert "author" in extracted_item + assert "Moby-Dick" in extracted_item.get("title", "") + assert "Herman Melville" in extracted_item.get("author", "") + else: + assert isinstance(extracted_data, dict) + assert "title" in extracted_data + assert "author" in extracted_data + assert "Moby-Dick" in extracted_data.get("title", "") + assert "Herman Melville" in extracted_data.get("author", "") + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + except Exception as e: # Catch any other unexpected error + pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}") + +if __name__ == "__main__": + # Define arguments for pytest programmatically + # -v: verbose output + # -s: show print statements immediately (useful for debugging) + # __file__: tells pytest to run tests in the current file + pytest_args = ["-v", "-s", __file__] + + # You can add more pytest arguments here if needed, for example: + 
# '-k test_llm_extraction': Run only the LLM test function + # pytest_args.append("-k test_llm_extraction") + + print(f"Running pytest with args: {pytest_args}") + + # Execute pytest + exit_code = pytest.main(pytest_args) + + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file From ecec53a8c1560b082bfe8f9cb1f5223a83f5e2f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 13 Apr 2025 20:14:41 +0800 Subject: [PATCH 3/5] Docker tested on Windows machine. --- Dockerfile | 14 +++++++++++++- docker-compose.yml | 3 +-- tests/docker/test_server_requests.py | 7 ++++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8b84f797..a4ab56df 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,7 +149,15 @@ RUN pip install --no-cache-dir --upgrade pip && \ python -c "import crawl4ai; print('βœ… crawl4ai is ready to rock!')" && \ python -c "from playwright.sync_api import sync_playwright; print('βœ… Playwright is feeling dramatic!')" -RUN playwright install --with-deps chromium +RUN crawl4ai-setup + +RUN playwright install --with-deps + +RUN mkdir -p /home/appuser/.cache/ms-playwright \ + && cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \ + && chown -R appuser:appuser /home/appuser/.cache/ms-playwright + +RUN crawl4ai-doctor # Copy application code COPY deploy/docker/* ${APP_HOME}/ @@ -174,4 +182,8 @@ EXPOSE 6379 # Switch to the non-root user before starting the application USER appuser +# Set environment variables to ptoduction +ENV PYTHON_ENV=production + +# Start the application using supervisord CMD ["supervisord", "-c", "supervisord.conf"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f112f9fd..4331d219 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,4 @@ # docker-compose.yml -# This file is in the root directory alongside Dockerfile # Base configuration anchor for reusability x-base-config: &base-config @@ -9,7 
+8,7 @@ x-base-config: &base-config # - "8080:8080" # Uncomment if needed # Load API keys primarily from .llm.env file - # Create .llm.env in the root directory from deploy/docker/.llm.env.example + # Create .llm.env in the root directory .llm.env.example env_file: - .llm.env diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py index ab8b8ced..56d2ada4 100644 --- a/tests/docker/test_server_requests.py +++ b/tests/docker/test_server_requests.py @@ -6,6 +6,10 @@ import asyncio import os from typing import List, Dict, Any, AsyncGenerator +from dotenv import load_dotenv +load_dotenv() + + # Optional: Import crawl4ai classes directly for reference/easier payload creation aid # You don't strictly NEED these imports for the tests to run against the server, # but they help in understanding the structure you are mimicking in JSON. @@ -29,7 +33,8 @@ from crawl4ai import ( ) # --- Test Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable # Use a known simple HTML page for basic tests SIMPLE_HTML_URL = "https://httpbin.org/html" # Use a site suitable for scraping tests From c56974cf5996302deb80a489163258607ec3cfde Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 14 Apr 2025 20:46:32 +0800 Subject: [PATCH 4/5] feat(docs): enhance documentation UI with ToC and GitHub stats Add new features to documentation UI: - Add table of contents with scroll spy functionality - Add GitHub repository statistics badge - Implement new centered layout system with fixed sidebar - Add conditional Playwright installation based on CRAWL4AI_MODE Breaking changes: None --- crawl4ai/install.py | 19 +- docs/md_v2/assets/github_stats.js | 119 ++++++++++++ docs/md_v2/assets/layout.css | 297 
++++++++++++++++++++++++++++++ docs/md_v2/assets/styles.css | 13 +- docs/md_v2/assets/toc.js | 144 +++++++++++++++ mkdocs.yml | 5 +- 6 files changed, 593 insertions(+), 4 deletions(-) create mode 100644 docs/md_v2/assets/github_stats.js create mode 100644 docs/md_v2/assets/layout.css create mode 100644 docs/md_v2/assets/toc.js diff --git a/crawl4ai/install.py b/crawl4ai/install.py index c0c3ab0d..b2fcca78 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -40,10 +40,25 @@ def setup_home_directory(): f.write("") def post_install(): - """Run all post-installation tasks""" + """ + Run all post-installation tasks. + Checks CRAWL4AI_MODE environment variable. If set to 'api', + skips Playwright browser installation. + """ logger.info("Running post-installation setup...", tag="INIT") setup_home_directory() - install_playwright() + + # Check environment variable to conditionally skip Playwright install + run_mode = os.getenv('CRAWL4AI_MODE') + if run_mode == 'api': + logger.warning( + "CRAWL4AI_MODE=api detected. Skipping Playwright browser installation.", + tag="SETUP" + ) + else: + # Proceed with installation only if mode is not 'api' + install_playwright() + run_migration() # TODO: Will be added in the future # setup_builtin_browser() diff --git a/docs/md_v2/assets/github_stats.js b/docs/md_v2/assets/github_stats.js new file mode 100644 index 00000000..a48b3de1 --- /dev/null +++ b/docs/md_v2/assets/github_stats.js @@ -0,0 +1,119 @@ +// ==== File: assets/github_stats.js ==== + +document.addEventListener('DOMContentLoaded', async () => { + // --- Configuration --- + const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container + const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav) + // Or set to null to append at the end of the header. 
+ + // --- Find elements --- + const headerContainer = document.querySelector(targetHeaderSelector); + if (!headerContainer) { + console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector); + return; + } + + const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link + let repoUrl = 'https://github.com/unclecode/crawl4ai'; + // if (repoLinkElement) { + // repoUrl = repoLinkElement.href; + // } else { + // // Fallback: Try finding from config (requires template injection - harder) + // // Or hardcode if necessary, but reading from the link is better. + // console.warn('GitHub Stats: GitHub repo link not found in header.'); + // // Try to get repo_url from mkdocs config if available globally (less likely) + // // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable + // // if (!repoUrl) return; // Exit if still no URL + // return; // Exit for now if link isn't found + // } + + + // --- Extract Repo Owner/Name --- + let owner = ''; + let repo = ''; + try { + const url = new URL(repoUrl); + const pathParts = url.pathname.split('/').filter(part => part.length > 0); + if (pathParts.length >= 2) { + owner = pathParts[0]; + repo = pathParts[1]; + } + } catch (e) { + console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e); + return; + } + + if (!owner || !repo) { + console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl); + return; + } + + // --- Get Version (Attempt to extract from site title) --- + let version = ''; + const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element + // Example title: "Crawl4AI Documentation (v0.5.x)" + if (siteTitleElement) { + const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional) + if (match && match[1]) { + version = match[1].trim(); + } + } + 
if (!version) { + console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.'); + // You could fallback to config.extra.version if injected into JS + // version = window.mkdocs_config?.extra?.version || 'N/A'; + } + + + // --- Fetch GitHub API Data --- + let stars = '...'; + let forks = '...'; + try { + const apiUrl = `https://api.github.com/repos/${owner}/${repo}`; + const response = await fetch(apiUrl); + + if (response.ok) { + const data = await response.json(); + // Format large numbers (optional) + stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count; + forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count; + } else { + console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`); + stars = 'N/A'; + forks = 'N/A'; + } + } catch (error) { + console.error('GitHub Stats: Error fetching repository data:', error); + stars = 'N/A'; + forks = 'N/A'; + } + + // --- Create Badge HTML --- + const badgeContainer = document.createElement('div'); + badgeContainer.className = 'github-stats-badge'; + + // Use innerHTML for simplicity, including potential icons (requires FontAwesome or similar) + // Ensure your theme loads FontAwesome or add it yourself if you want icons. + badgeContainer.innerHTML = ` + + + + ${owner}/${repo} + ${version ? ` ${version}` : ''} + ${stars} + ${forks} + + `; + + // --- Inject Badge into Header --- + const insertBeforeElement = insertBeforeSelector ? 
headerContainer.querySelector(insertBeforeSelector) : null; + if (insertBeforeElement) { + // headerContainer.insertBefore(badgeContainer, insertBeforeElement); + headerContainer.querySelector(insertBeforeSelector).appendChild(badgeContainer); + } else { + headerContainer.appendChild(badgeContainer); + } + + console.info('GitHub Stats: Badge added to header.'); + +}); \ No newline at end of file diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css new file mode 100644 index 00000000..db5fac55 --- /dev/null +++ b/docs/md_v2/assets/layout.css @@ -0,0 +1,297 @@ +/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */ + +:root { + --header-height: 55px; /* Adjust if needed */ + --sidebar-width: 280px; /* Adjust if needed */ + --toc-width: 340px; /* As specified */ + --content-max-width: 90em; /* Max width for the centered content */ + --layout-transition-speed: 0.2s; + --global-space: 10px; +} + +/* --- Basic Setup --- */ +html { + scroll-behavior: smooth; + scroll-padding-top: calc(var(--header-height) + 15px); + box-sizing: border-box; +} +*, *:before, *:after { + box-sizing: inherit; +} + +body { + padding-top: 0; + padding-bottom: 0; + background-color: var(--background-color); + color: var(--font-color); + /* Prevents horizontal scrollbars during transitions */ + overflow-x: hidden; +} + +/* --- Fixed Header --- */ +/* Full width, fixed header */ +.terminal .container:first-child { /* Assuming this targets the header container */ + position: fixed; + top: 0; + left: 0; + right: 0; + height: var(--header-height); + background-color: var(--background-color); + z-index: 1000; + border-bottom: 1px solid var(--progress-bar-background); + max-width: none; /* Override any container max-width */ + padding: 0 calc(var(--global-space) * 2); +} + +/* --- Main Layout Container (Below Header) --- */ +/* This container just provides space for the fixed header */ +.container:has(.terminal-mkdocs-main-grid) { + margin: 0 auto; + padding: 0; + 
padding-top: var(--header-height); /* Space for fixed header */ +} + +/* --- Flex Container: Grid holding content and toc (CENTERED) --- */ +/* THIS is the main centered block */ +.terminal-mkdocs-main-grid { + display: flex; + align-items: flex-start; + /* Enforce max-width and center */ + max-width: var(--content-max-width); + margin-left: auto; + margin-right: auto; + position: relative; + /* Apply side padding within the centered block */ + padding-left: calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + /* Add margin-left to clear the fixed sidebar */ + margin-left: var(--sidebar-width); +} + +/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */ +#terminal-mkdocs-side-panel { + position: fixed; + top: var(--header-height); + left: max(0px, calc((100vw - var(--content-max-width)) / 2)); + bottom: 0; + width: var(--sidebar-width); + background-color: var(--background-color); + border-right: 1px solid var(--progress-bar-background); + overflow-y: auto; + z-index: 900; + padding: 1em calc(var(--global-space) * 2); + padding-bottom: 2em; + /* transition: left var(--layout-transition-speed) ease-in-out; */ +} + +/* --- 2. Main Content Area (Within Centered Grid) --- */ +#terminal-mkdocs-main-content { + flex-grow: 1; + flex-shrink: 1; + min-width: 0; /* Flexbox shrink fix */ + + /* No left/right margins needed here - handled by parent grid */ + margin-left: 0; + margin-right: 0; + + /* Internal Padding */ + padding: 1.5em 2em; + + position: relative; + z-index: 1; +} + +/* --- 3. 
Right Table of Contents (Sticky, Within Centered Grid) --- */ +#toc-sidebar { + flex-basis: var(--toc-width); + flex-shrink: 0; + width: var(--toc-width); + + position: sticky; /* Sticks within the centered grid */ + top: var(--header-height); + align-self: stretch; + height: calc(100vh - var(--header-height)); + overflow-y: auto; + + padding: 1.5em 1em; + font-size: 0.85em; + border-left: 1px solid var(--progress-bar-background); + z-index: 800; + /* display: none; /* JS handles */ +} + +/* (ToC link styles remain the same) */ +#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; } +#toc-sidebar ul { list-style: none; padding: 0; margin: 0; } +#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; } +#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; } +#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; } +#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); } +#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); } + + +/* --- Footer Styling (Respects Centered Layout) --- */ +footer { + background-color: var(--code-bg-color); + color: var(--secondary-color); + position: relative; + z-index: 10; + margin-top: 2em; + + /* Apply margin-left to clear the fixed sidebar */ + margin-left: var(--sidebar-width); + + /* Constrain width relative to the centered grid it follows */ + max-width: calc(var(--content-max-width) - var(--sidebar-width)); + margin-right: auto; /* Keep it left-aligned within the space next to sidebar */ + + /* Use padding consistent with the grid */ + padding: 2em calc(var(--global-space) * 2); +} + +/* Adjust footer grid if needed */ 
+.terminal-mkdocs-footer-grid { + display: grid; + grid-template-columns: 1fr auto; + gap: 1em; + align-items: center; +} + +/* ========================================================================== + RESPONSIVENESS (Adapting the Non-Fluid Layout) + ========================================================================== */ + +/* --- Medium screens: Hide ToC --- */ +@media screen and (max-width: 1200px) { + #toc-sidebar { + display: none; + } + + .terminal-mkdocs-main-grid { + /* Grid adjusts automatically as ToC is removed */ + /* Ensure grid padding remains */ + padding-left: calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + } + + #terminal-mkdocs-main-content { + /* Content area naturally expands */ + } + + footer { + /* Footer still respects the left sidebar and overall max width */ + margin-left: var(--sidebar-width); + max-width: calc(var(--content-max-width) - var(--sidebar-width)); + /* Padding remains consistent */ + padding-left: calc(var(--global-space) * 2); + padding-right: calc(var(--global-space) * 2); + } +} + +/* --- Small screens: Hide left sidebar, full width content & footer --- */ +@media screen and (max-width: 768px) { + + #terminal-mkdocs-side-panel { + left: calc(-1 * var(--sidebar-width)); + z-index: 1100; + box-shadow: 2px 0 10px rgba(0,0,0,0.3); + } + #terminal-mkdocs-side-panel.sidebar-visible { + left: 0; + } + + .terminal-mkdocs-main-grid { + /* Grid now takes full width (minus body padding) */ + margin-left: 0; /* Override sidebar margin */ + margin-right: 0; /* Override auto margin */ + max-width: 100%; /* Allow full width */ + padding-left: var(--global-space); /* Reduce padding */ + padding-right: var(--global-space); + } + + #terminal-mkdocs-main-content { + padding: 1.5em 1em; /* Adjust internal padding */ + } + + footer { + margin-left: 0; /* Full width footer */ + max-width: 100%; /* Allow full width */ + padding: 2em 1em; /* Adjust internal padding */ + } + + .terminal-mkdocs-footer-grid { 
+ grid-template-columns: 1fr; /* Stack footer items */ + text-align: center; + gap: 0.5em; + } + /* Remember JS for toggle button & overlay */ +} + + +/* ==== GitHub Stats Badge Styling ==== */ + +.github-stats-badge { + display: inline-block; /* Or flex if needed */ + margin-left: 2em; /* Adjust spacing */ + vertical-align: middle; /* Align with other header items */ + font-size: 0.9em; /* Slightly smaller font */ +} + +.github-stats-badge a { + color: var(--secondary-color); /* Use secondary color */ + text-decoration: none; + display: flex; /* Use flex for alignment */ + align-items: center; + gap: 0.8em; /* Space between items */ + padding: 0.2em 0.5em; + border: 1px solid var(--progress-bar-background); /* Subtle border */ + border-radius: 4px; + transition: color 0.2s, background-color 0.2s; +} + +.github-stats-badge a:hover { + color: var(--font-color); /* Brighter color on hover */ + background-color: var(--progress-bar-background); /* Subtle background on hover */ +} + +.github-stats-badge .repo-name { + color: var(--font-color); /* Make repo name stand out slightly */ + font-weight: 500; /* Optional bolder weight */ +} + +.github-stats-badge .stat { + /* Styles for individual stats (version, stars, forks) */ + white-space: nowrap; /* Prevent wrapping */ +} + +.github-stats-badge .stat i { + /* Optional: Style for FontAwesome icons */ + margin-right: 0.3em; + color: var(--secondary-dimmed-color); /* Dimmer color for icons */ +} + + +/* Adjust positioning relative to search/nav if needed */ +/* Example: If search is floated right */ +/* .terminal-nav { float: left; } */ +/* .github-stats-badge { float: left; } */ +/* #mkdocs-search-query { float: right; } */ + +/* --- Responsive adjustments --- */ +@media screen and (max-width: 900px) { /* Example breakpoint */ + .github-stats-badge .repo-name { + display: none; /* Hide full repo name on smaller screens */ + } + .github-stats-badge { + margin-left: 1em; + } + .github-stats-badge a { + gap: 0.5em; + } +} 
+@media screen and (max-width: 768px) { + /* Further hide or simplify on mobile if needed */ + .github-stats-badge { + display: none; /* Example: Hide completely on smallest screens */ + } +} \ No newline at end of file diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index 8ee8cbb1..751aabb7 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -50,8 +50,17 @@ --display-h1-decoration: none; --display-h1-decoration: none; + + --header-height: 65px; /* Adjust based on your actual header height */ + --sidebar-width: 280px; /* Adjust based on your desired sidebar width */ + --toc-width: 240px; /* Adjust based on your desired ToC width */ + --layout-transition-speed: 0.2s; /* For potential future animations */ + + --page-width : 90em; /* Adjust based on your design */ } + + /* body { background-color: var(--background-color); color: var(--font-color); @@ -256,4 +265,6 @@ div.badges a { } div.badges a > img { width: auto; -} \ No newline at end of file +} + + diff --git a/docs/md_v2/assets/toc.js b/docs/md_v2/assets/toc.js new file mode 100644 index 00000000..8dad06b2 --- /dev/null +++ b/docs/md_v2/assets/toc.js @@ -0,0 +1,144 @@ +// ==== File: assets/toc.js ==== + +document.addEventListener('DOMContentLoaded', () => { + const mainContent = document.getElementById('terminal-mkdocs-main-content'); + const tocContainer = document.getElementById('toc-sidebar'); + const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container + + if (!mainContent) { + console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found."); + return; + } + + // --- Create ToC container if it doesn't exist --- + let tocElement = tocContainer; + if (!tocElement) { + if (!mainGrid) { + console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC."); + return; + } + tocElement = document.createElement('aside'); + tocElement.id = 'toc-sidebar'; + 
tocElement.style.display = 'none'; // Keep hidden initially + // Append it as the last child of the flex grid + mainGrid.appendChild(tocElement); + console.info("TOC Generator: Created '#toc-sidebar' element."); + } + + // --- Find Headings (h2, h3, h4 are common for ToC) --- + const headings = mainContent.querySelectorAll('h2, h3, h4'); + if (headings.length === 0) { + console.info("TOC Generator: No headings found on this page. ToC not generated."); + tocElement.style.display = 'none'; // Ensure it's hidden + return; + } + + // --- Generate ToC List --- + const tocList = document.createElement('ul'); + const observerTargets = []; // Store headings for IntersectionObserver + + headings.forEach((heading, index) => { + // Ensure heading has an ID for linking + if (!heading.id) { + // Create a simple slug-like ID + heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`; + } + + const listItem = document.createElement('li'); + const link = document.createElement('a'); + + link.href = `#${heading.id}`; + link.textContent = heading.textContent; + + // Add class for styling based on heading level + const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4 + listItem.classList.add(`toc-level-${level}`); + + listItem.appendChild(link); + tocList.appendChild(listItem); + observerTargets.push(heading); // Add to observer list + }); + + // --- Populate and Show ToC --- + // Optional: Add a title + const tocTitle = document.createElement('h4'); + tocTitle.textContent = 'On this page'; // Customize title if needed + + tocElement.innerHTML = ''; // Clear previous content if any + tocElement.appendChild(tocTitle); + tocElement.appendChild(tocList); + tocElement.style.display = ''; // Show the ToC container + + console.info(`TOC Generator: Generated ToC with ${headings.length} items.`); + + // --- Scroll Spy using Intersection Observer --- + const tocLinks = tocElement.querySelectorAll('a'); + let 
activeLink = null; // Keep track of the current active link + + const observerOptions = { + // Observe changes relative to the viewport, offset by the header height + // Negative top margin pushes the intersection trigger point down + // Negative bottom margin ensures elements low on the screen can trigger before they exit + rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`, + threshold: 0 // Trigger as soon as any part enters/exits the boundary + }; + + const observerCallback = (entries) => { + let topmostVisibleHeading = null; + + entries.forEach(entry => { + const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`); + if (!link) return; + + // Check if the heading is intersecting (partially or fully visible within rootMargin) + if (entry.isIntersecting) { + // Among visible headings, find the one closest to the top edge (within the rootMargin) + if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) { + topmostVisibleHeading = entry.target; + } + } + }); + + // If we found a topmost visible heading, activate its link + if (topmostVisibleHeading) { + const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`); + if (newActiveLink && newActiveLink !== activeLink) { + // Remove active class from previous link + if (activeLink) { + activeLink.classList.remove('active'); + activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling + } + // Add active class to the new link + newActiveLink.classList.add('active'); + newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling + activeLink = newActiveLink; + + // Optional: Scroll the ToC sidebar to keep the active link visible + // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' }); + } + } + // If no headings are intersecting (scrolled past the last one?), maybe deactivate all + // Or 
keep the last one active - depends on desired behavior. Current logic keeps last active. + }; + + const observer = new IntersectionObserver(observerCallback, observerOptions); + + // Observe all target headings + observerTargets.forEach(heading => observer.observe(heading)); + + // Initial check in case a heading is already in view on load + // (Requires slight delay for accurate layout calculation) + setTimeout(() => { + observerCallback(observer.takeRecords()); // Process initial state + }, 100); + + // move footer and the hr before footer to the end of the main content + const footer = document.querySelector('footer'); + const hr = footer.previousElementSibling; + if (hr && hr.tagName === 'HR') { + mainContent.appendChild(hr); + } + mainContent.appendChild(footer); + console.info("TOC Generator: Footer moved to the end of the main content."); + +}); \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 82b2fa02..1c7be7a3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,6 +76,7 @@ extra: version: !ENV [CRAWL4AI_VERSION, 'development'] extra_css: + - assets/layout.css - assets/styles.css - assets/highlight.css - assets/dmvendor.css @@ -83,4 +84,6 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js - - https://buttons.github.io/buttons.js \ No newline at end of file + - https://buttons.github.io/buttons.js + - assets/toc.js + - assets/github_stats.js \ No newline at end of file From cd7ff6f9c137348003493606b1b453637c624fac Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 14 Apr 2025 23:00:47 +0800 Subject: [PATCH 5/5] feat(docs): add AI assistant interface and code copy button Add new AI assistant chat interface with features: - Real-time chat with markdown support - Chat history management - Citation tracking - Selection-to-query functionality Also adds code copy button to documentation code blocks and adjusts layout/styling. 
Breaking changes: None --- docs/md_v2/ask_ai/ask-ai.css | 444 ++++++++++++++ docs/md_v2/ask_ai/ask-ai.js | 603 ++++++++++++++++++++ docs/md_v2/ask_ai/index.html | 64 +++ docs/md_v2/assets/copy_code.js | 62 ++ docs/md_v2/assets/floating_ask_ai_button.js | 39 ++ docs/md_v2/assets/layout.css | 146 ++++- docs/md_v2/assets/selection_ask_ai.js | 109 ++++ docs/md_v2/assets/styles.css | 6 +- docs/md_v2/core/ask-ai.md | 74 +++ mkdocs.yml | 8 +- 10 files changed, 1549 insertions(+), 6 deletions(-) create mode 100644 docs/md_v2/ask_ai/ask-ai.css create mode 100644 docs/md_v2/ask_ai/ask-ai.js create mode 100644 docs/md_v2/ask_ai/index.html create mode 100644 docs/md_v2/assets/copy_code.js create mode 100644 docs/md_v2/assets/floating_ask_ai_button.js create mode 100644 docs/md_v2/assets/selection_ask_ai.js create mode 100644 docs/md_v2/core/ask-ai.md diff --git a/docs/md_v2/ask_ai/ask-ai.css b/docs/md_v2/ask_ai/ask-ai.css new file mode 100644 index 00000000..c464d43b --- /dev/null +++ b/docs/md_v2/ask_ai/ask-ai.css @@ -0,0 +1,444 @@ +/* ==== File: docs/ask_ai/ask_ai.css ==== */ + +/* --- Basic Reset & Font --- */ +body { + /* Attempt to inherit variables from parent window (iframe context) */ + /* Fallback values if variables are not inherited */ + --fallback-bg: #070708; + --fallback-font: #e8e9ed; + --fallback-secondary: #a3abba; + --fallback-primary: #50ffff; + --fallback-primary-dimmed: #09b5a5; + --fallback-border: #1d1d20; + --fallback-code-bg: #1e1e1e; + --fallback-invert-font: #222225; + --font-stack: dm, Monaco, Courier New, monospace, serif; + + font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */ + background-color: var(--background-color, var(--fallback-bg)); + color: var(--font-color, var(--fallback-font)); + margin: 0; + padding: 0; + font-size: 14px; /* Match global font size */ + line-height: 1.5em; /* Match global line height */ + height: 100vh; /* Ensure body takes full height */ + overflow: hidden; /* Prevent body scrollbars, 
panels handle scroll */ + display: flex; /* Use flex for the main container */ +} + +a { + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + transition: color 0.2s; +} +a:hover { + color: var(--primary-color, var(--fallback-primary)); +} + +/* --- Main Container Layout --- */ +.ai-assistant-container { + display: flex; + width: 100%; + height: 100%; + background-color: var(--background-color, var(--fallback-bg)); +} + +/* --- Sidebar Styling --- */ +.sidebar { + flex-shrink: 0; /* Prevent sidebars from shrinking */ + height: 100%; + display: flex; + flex-direction: column; + /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */ + overflow-y: hidden; /* Header fixed, list scrolls */ +} + +.left-sidebar { + flex-basis: 240px; /* Width of history panel */ + border-right: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.right-sidebar { + flex-basis: 280px; /* Width of citations panel */ + border-left: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.sidebar header { + padding: 0.6em 1em; + border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border)); + flex-shrink: 0; + display: flex; + justify-content: space-between; + align-items: center; +} + +.sidebar header h3 { + margin: 0; + font-size: 1.1em; + color: var(--font-color, var(--fallback-font)); +} + +.sidebar ul { + list-style: none; + padding: 0; + margin: 0; + overflow-y: auto; /* Enable scrolling for the list */ + flex-grow: 1; /* Allow list to take remaining space */ + padding: 0.5em 0; +} + +.sidebar ul li { + padding: 0.3em 1em; +} +.sidebar ul li.no-citations, +.sidebar ul li.no-history { + color: var(--secondary-color, var(--fallback-secondary)); + font-style: italic; + font-size: 0.9em; + padding-left: 1em; +} + +.sidebar ul li a { + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + display: block; + padding: 0.2em 0.5em; + border-radius: 3px; + 
transition: background-color 0.2s, color 0.2s; +} + +.sidebar ul li a:hover { + color: var(--primary-color, var(--fallback-primary)); + background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */ +} +/* Style for active history item */ +#history-list li.active a { + color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + font-weight: bold; + background-color: rgba(80, 255, 255, 0.12); +} + +/* --- Chat Panel Styling --- */ +#chat-panel { + flex-grow: 1; /* Take remaining space */ + display: flex; + flex-direction: column; + height: 100%; + overflow: hidden; /* Prevent overflow, internal elements handle scroll */ +} + +#chat-messages { + flex-grow: 1; + overflow-y: auto; /* Scrollable chat history */ + padding: 1em 1.5em; + border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border)); +} + +.message { + margin-bottom: 1em; + padding: 0.8em 1.2em; + border-radius: 8px; + max-width: 90%; /* Slightly wider */ + line-height: 1.6; + /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */ + white-space: pre-wrap; + word-wrap: break-word; /* Ensure long words break */ +} + +.user-message { + background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */ + color: var(--font-color, var(--fallback-font)); + margin-left: auto; /* Align user messages to the right */ + text-align: left; +} + +.ai-message { + background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */ + color: var(--font-color, var(--fallback-font)); + margin-right: auto; /* Align AI messages to the left */ + border: 1px solid var(--progress-bar-background, var(--fallback-border)); +} +.ai-message.welcome-message { + border: none; + background-color: transparent; + max-width: 100%; + text-align: center; + color: var(--secondary-color, var(--fallback-secondary)); + white-space: normal; +} + +/* Styles for code within messages */ +.ai-message code { + background-color: 
var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */ + /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */ + padding: 0.1em 0.4em; + border-radius: 4px; + font-size: 0.9em; +} +.ai-message pre { + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--background-color, var(--fallback-bg)) !important; + padding: 1em; + border-radius: 5px; + overflow-x: auto; + margin: 0.8em 0; + white-space: pre; +} +.ai-message pre code { + background-color: transparent !important; + padding: 0; + font-size: inherit; +} + +/* Override white-space for specific elements generated by Markdown */ +.ai-message p, +.ai-message ul, +.ai-message ol, +.ai-message blockquote { + white-space: normal; /* Allow standard wrapping for block elements */ +} + +/* --- Markdown Element Styling within Messages --- */ +.message p { + margin-top: 0; + margin-bottom: 0.5em; +} +.message p:last-child { + margin-bottom: 0; +} +.message ul, +.message ol { + margin: 0.5em 0 0.5em 1.5em; + padding: 0; +} +.message li { + margin-bottom: 0.2em; +} + +/* Code block styling (adjusts previous rules slightly) */ +.message code { + /* Inline code */ + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--font-color); + padding: 0.1em 0.4em; + border-radius: 4px; + font-size: 0.9em; + /* Ensure inline code breaks nicely */ + word-break: break-all; + white-space: normal; /* Allow inline code to wrap if needed */ +} +.message pre { + /* Code block container */ + background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; + color: var(--background-color, var(--fallback-bg)) !important; + padding: 1em; + border-radius: 5px; + overflow-x: auto; + margin: 0.8em 0; + font-size: 0.9em; /* Slightly smaller code blocks */ +} +.message pre code { + /* Code within code block */ + background-color: transparent !important; + padding: 0; + font-size: 
inherit; + word-break: normal; /* Don't break words in code blocks */ + white-space: pre; /* Preserve whitespace strictly in code blocks */ +} + +/* Thinking indicator */ +.message-thinking { + display: inline-block; + width: 5px; + height: 5px; + background-color: var(--primary-color, var(--fallback-primary)); + border-radius: 50%; + margin-left: 8px; + vertical-align: middle; + animation: thinking 1s infinite ease-in-out; +} +@keyframes thinking { + 0%, + 100% { + opacity: 0.5; + transform: scale(0.8); + } + 50% { + opacity: 1; + transform: scale(1.2); + } +} + +/* --- Thinking Indicator (Blinking Cursor Style) --- */ +.thinking-indicator-cursor { + display: inline-block; + width: 10px; /* Width of the cursor */ + height: 1.1em; /* Match line height */ + background-color: var(--primary-color, var(--fallback-primary)); + margin-left: 5px; + vertical-align: text-bottom; /* Align with text baseline */ + animation: blink-cursor 1s step-end infinite; +} + +@keyframes blink-cursor { + from, + to { + background-color: transparent; + } + 50% { + background-color: var(--primary-color, var(--fallback-primary)); + } +} + +#chat-input-area { + flex-shrink: 0; /* Prevent input area from shrinking */ + padding: 1em 1.5em; + display: flex; + align-items: flex-end; /* Align items to bottom */ + gap: 10px; + background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */ +} + +#chat-input-area textarea { + flex-grow: 1; + padding: 0.8em 1em; + border: 1px solid var(--progress-bar-background, var(--fallback-border)); + background-color: var(--background-color, var(--fallback-bg)); + color: var(--font-color, var(--fallback-font)); + border-radius: 5px; + resize: none; /* Disable manual resize */ + font-family: inherit; + font-size: 1em; + line-height: 1.4; + max-height: 150px; /* Limit excessive height */ + overflow-y: auto; + /* rows: 2; */ +} + +#chat-input-area button { + /* Basic button styling - maybe inherit from main theme? 
*/ + padding: 0.6em 1.2em; + border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + color: var(--background-color, var(--fallback-bg)); + border-radius: 5px; + cursor: pointer; + font-size: 0.9em; + transition: background-color 0.2s, border-color 0.2s; + height: min-content; /* Align with bottom of textarea */ +} + +#chat-input-area button:hover { + background-color: var(--primary-color, var(--fallback-primary)); + border-color: var(--primary-color, var(--fallback-primary)); +} +#chat-input-area button:disabled { + opacity: 0.6; + cursor: not-allowed; +} + +.loading-indicator { + font-size: 0.9em; + color: var(--secondary-color, var(--fallback-secondary)); + margin-right: 10px; + align-self: center; +} + +/* --- Buttons --- */ +/* Inherit some button styles if possible */ +.btn.btn-sm { + color: var(--font-color, var(--fallback-font)); + padding: 0.2em 0.5em; + font-size: 0.8em; + border: 1px solid var(--secondary-color, var(--fallback-secondary)); + background: none; + border-radius: 3px; + cursor: pointer; +} +.btn.btn-sm:hover { + border-color: var(--font-color, var(--fallback-font)); + background-color: var(--progress-bar-background, var(--fallback-border)); +} + +/* --- Basic Responsiveness --- */ +@media screen and (max-width: 900px) { + .left-sidebar { + flex-basis: 200px; /* Shrink history */ + } + .right-sidebar { + flex-basis: 240px; /* Shrink citations */ + } +} + +@media screen and (max-width: 768px) { + /* Stack layout on mobile? Or hide sidebars? 
Hiding for now */ + .sidebar { + display: none; /* Hide sidebars on small screens */ + } + /* Could add toggle buttons later */ +} + + +/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */ + + +.sidebar ul li { + /* Use flexbox to align link and delete button */ + display: flex; + justify-content: space-between; + align-items: center; + padding: 0; /* Remove padding from li, add to link/button */ + margin: 0.1em 0; /* Small vertical margin */ +} + +.sidebar ul li a { + /* Link takes most space */ + flex-grow: 1; + padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */ + /* Make ellipsis work for long titles */ + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + /* Keep existing link styles */ + color: var(--secondary-color, var(--fallback-secondary)); + text-decoration: none; + display: block; + border-radius: 3px; + transition: background-color 0.2s, color 0.2s; +} +.sidebar ul li a:hover { + color: var(--primary-color, var(--fallback-primary)); + background-color: rgba(80, 255, 255, 0.08); +} + +/* Style for active history item's link */ +#history-list li.active a { + color: var(--primary-dimmed-color, var(--fallback-primary-dimmed)); + font-weight: bold; + background-color: rgba(80, 255, 255, 0.12); +} + +/* --- Delete Chat Button --- */ +.delete-chat-btn { + flex-shrink: 0; /* Don't shrink */ + background: none; + border: none; + color: var(--secondary-color, var(--fallback-secondary)); + cursor: pointer; + padding: 0.4em 0.8em; /* Padding around icon */ + font-size: 0.9em; + opacity: 0.5; /* Dimmed by default */ + transition: opacity 0.2s, color 0.2s; + margin-left: 5px; /* Space between link and button */ + border-radius: 3px; +} + +.sidebar ul li:hover .delete-chat-btn, +.delete-chat-btn:hover { + opacity: 1; /* Show fully on hover */ + color: var(--error-color, #ff3c74); /* Use error color on hover */ +} +.delete-chat-btn:focus { + outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */ + opacity: 1; +} 
diff --git a/docs/md_v2/ask_ai/ask-ai.js b/docs/md_v2/ask_ai/ask-ai.js new file mode 100644 index 00000000..2710923e --- /dev/null +++ b/docs/md_v2/ask_ai/ask-ai.js @@ -0,0 +1,603 @@ +// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ==== + +document.addEventListener("DOMContentLoaded", () => { + console.log("AI Assistant JS V2 Loaded"); + + // --- DOM Element Selectors --- + const historyList = document.getElementById("history-list"); + const newChatButton = document.getElementById("new-chat-button"); + const chatMessages = document.getElementById("chat-messages"); + const chatInput = document.getElementById("chat-input"); + const sendButton = document.getElementById("send-button"); + const citationsList = document.getElementById("citations-list"); + + // --- Constants --- + const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1"; + const CHAT_PREFIX = "aiAssistantChat_v1_"; + + // --- State --- + let currentChatId = null; + let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' 
} + let isThinking = false; + let streamInterval = null; // To control the streaming interval + + // --- Event Listeners --- + sendButton.addEventListener("click", handleSendMessage); + chatInput.addEventListener("keydown", handleInputKeydown); + newChatButton.addEventListener("click", handleNewChat); + chatInput.addEventListener("input", autoGrowTextarea); + + // --- Initialization --- + loadChatHistoryIndex(); // Load history list on startup + const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param + if (!initialQuery) { + loadInitialChat(); // Load normally if no query + } + + // --- Core Functions --- + + function handleSendMessage() { + const userMessageText = chatInput.value.trim(); + if (!userMessageText || isThinking) return; + + setThinking(true); // Start thinking state + + // Add user message to state and UI + const userMessage = { sender: "user", text: userMessageText }; + conversationHistory.push(userMessage); + addMessageToChat(userMessage, false); // Add user message without parsing markdown + + chatInput.value = ""; + autoGrowTextarea(); // Reset textarea height + + // Prepare for AI response (create empty div) + const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator + + // TODO: Generate fingerprint/JWT here + + // TODO: Send `conversationHistory` + JWT to backend API + // Replace placeholder below with actual API call + // The backend should ideally return a stream of text tokens + + // --- Placeholder Streaming Simulation --- + const simulatedFullResponse = `Okay, Here’s a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output: + +\`\`\`python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) # Print first 300 chars + +if 
__name__ == "__main__": + asyncio.run(main()) +\`\`\` + +A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`; + + // Simulate receiving the response stream + streamSimulatedResponse(aiMessageDiv, simulatedFullResponse); + + // // Simulate receiving citations *after* stream starts (or with first chunk) + // setTimeout(() => { + // addCitations([ + // { title: "Simulated Doc 1", url: "#sim1" }, + // { title: "Another Concept", url: "#sim2" }, + // ]); + // }, 500); // Citations appear shortly after thinking starts + } + + function handleInputKeydown(event) { + if (event.key === "Enter" && !event.shiftKey) { + event.preventDefault(); + handleSendMessage(); + } + } + + function addMessageToChat(message, addThinkingIndicator = false) { + const messageDiv = document.createElement("div"); + messageDiv.classList.add("message", `${message.sender}-message`); + + // Parse markdown and set HTML + messageDiv.innerHTML = message.text ? marked.parse(message.text) : ""; + + if (message.sender === "ai") { + // Apply Syntax Highlighting AFTER setting innerHTML + messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => { + if (typeof hljs !== "undefined") { + // Check if already highlighted to prevent double-highlighting issues + if (!block.classList.contains("hljs")) { + hljs.highlightElement(block); + } + } else { + console.warn("highlight.js (hljs) not found for syntax highlighting."); + } + }); + + // Add thinking indicator if needed (and not already present) + if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) { + const thinkingDiv = document.createElement("div"); + thinkingDiv.className = "thinking-indicator-cursor"; + messageDiv.appendChild(thinkingDiv); + } + } else { + // User messages remain plain text + // messageDiv.textContent = message.text; + } + + // wrap each pre in a div.terminal + messageDiv.querySelectorAll("pre").forEach((block) => { + const wrapper = 
document.createElement("div"); + wrapper.className = "terminal"; + block.parentNode.insertBefore(wrapper, block); + wrapper.appendChild(block); + }); + + chatMessages.appendChild(messageDiv); + // Scroll only if user is near the bottom? (More advanced) + // Simple scroll for now: + scrollToBottom(); + return messageDiv; // Return the created element + } + + function streamSimulatedResponse(messageDiv, fullText) { + const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor"); + if (thinkingIndicator) thinkingIndicator.remove(); + + const tokens = fullText.split(/(\s+)/); + let currentText = ""; + let tokenIndex = 0; + // Clear previous interval just in case + if (streamInterval) clearInterval(streamInterval); + + streamInterval = setInterval(() => { + const cursorSpan = ''; // Cursor for streaming + if (tokenIndex < tokens.length) { + currentText += tokens[tokenIndex]; + // Render intermediate markdown + cursor + messageDiv.innerHTML = marked.parse(currentText + cursorSpan); + // Re-highlight code blocks on each stream update - might be slightly inefficient + // but ensures partial code blocks look okay. Highlight only final on completion. 
+ // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => { + // hljs.highlightElement(block); + // }); + scrollToBottom(); // Keep scrolling as content streams + tokenIndex++; + } else { + // Streaming finished + clearInterval(streamInterval); + streamInterval = null; + + // Final render without cursor + messageDiv.innerHTML = marked.parse(currentText); + + // === Final Syntax Highlighting === + messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => { + if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) { + hljs.highlightElement(block); + } + }); + + // === Extract Citations === + const citations = extractMarkdownLinks(currentText); + + // Wrap each pre in a div.terminal + messageDiv.querySelectorAll("pre").forEach((block) => { + const wrapper = document.createElement("div"); + wrapper.className = "terminal"; + block.parentNode.insertBefore(wrapper, block); + wrapper.appendChild(block); + }); + + const aiMessage = { sender: "ai", text: currentText, citations: citations }; + conversationHistory.push(aiMessage); + updateCitationsDisplay(); + saveCurrentChat(); + setThinking(false); + } + }, 50); // Adjust speed + } + + // === NEW Function to Extract Links === + function extractMarkdownLinks(markdownText) { + const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url) + const citations = []; + let match; + while ((match = regex.exec(markdownText)) !== null) { + // Avoid adding self-links from within the citations list if AI includes them + if (!match[2].startsWith("#citation-")) { + citations.push({ + title: match[1].trim(), + url: match[2].trim(), + }); + } + } + // Optional: Deduplicate links based on URL + const uniqueCitations = citations.filter( + (citation, index, self) => index === self.findIndex((c) => c.url === citation.url) + ); + return uniqueCitations; + } + + // === REVISED Function to Display Citations === + function updateCitationsDisplay() { + let lastCitations = null; + // Find the most recent AI 
message with citations + for (let i = conversationHistory.length - 1; i >= 0; i--) { + if ( + conversationHistory[i].sender === "ai" && + conversationHistory[i].citations && + conversationHistory[i].citations.length > 0 + ) { + lastCitations = conversationHistory[i].citations; + break; // Found the latest citations + } + } + + citationsList.innerHTML = ""; // Clear previous + if (!lastCitations) { + citationsList.innerHTML = '
  • No citations available.
  • '; + return; + } + + lastCitations.forEach((citation, index) => { + const li = document.createElement("li"); + const a = document.createElement("a"); + // Generate a unique ID for potential internal linking if needed + // a.id = `citation-${index}`; + a.href = citation.url || "#"; + a.textContent = citation.title; + a.target = "_top"; // Open in main window + li.appendChild(a); + citationsList.appendChild(li); + }); + } + + function addCitations(citations) { + citationsList.innerHTML = ""; // Clear + if (!citations || citations.length === 0) { + citationsList.innerHTML = '
  • No citations available.
  • '; + return; + } + citations.forEach((citation) => { + const li = document.createElement("li"); + const a = document.createElement("a"); + a.href = citation.url || "#"; + a.textContent = citation.title; + a.target = "_top"; // Open in main window + li.appendChild(a); + citationsList.appendChild(li); + }); + } + + function setThinking(thinking) { + isThinking = thinking; + sendButton.disabled = thinking; + chatInput.disabled = thinking; + chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI..."; + // Stop any existing stream if we start thinking again (e.g., rapid resend) + if (thinking && streamInterval) { + clearInterval(streamInterval); + streamInterval = null; + } + } + + function autoGrowTextarea() { + chatInput.style.height = "auto"; + chatInput.style.height = `${chatInput.scrollHeight}px`; + } + + function scrollToBottom() { + chatMessages.scrollTop = chatMessages.scrollHeight; + } + + // --- Query Parameter Handling --- + function checkForInitialQuery(locationToCheck) { + // <-- Receive location object + if (!locationToCheck) { + console.warn("Ask AI: Could not access parent window location."); + return false; + } + const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string + const encodedQuery = urlParams.get("qq"); // <-- Use 'qq' + + if (encodedQuery) { + console.log("Initial query found (qq):", encodedQuery); + try { + const decodedText = decodeURIComponent(escape(atob(encodedQuery))); + console.log("Decoded query:", decodedText); + + // Start new chat immediately + handleNewChat(true); + + // Delay setting input and sending message slightly + setTimeout(() => { + chatInput.value = decodedText; + autoGrowTextarea(); + handleSendMessage(); + + // Clean the PARENT window's URL + try { + const cleanUrl = locationToCheck.pathname; + // Use parent's history object + window.parent.history.replaceState({}, window.parent.document.title, cleanUrl); + } catch (e) { + console.warn("Ask AI: 
Could not clean parent URL using replaceState.", e); + // This might fail due to cross-origin restrictions if served differently, + // but should work fine with mkdocs serve on the same origin. + } + }, 100); + + return true; // Query processed + } catch (e) { + console.error("Error decoding initial query (qq):", e); + // Clean the PARENT window's URL even on error + try { + const cleanUrl = locationToCheck.pathname; + window.parent.history.replaceState({}, window.parent.document.title, cleanUrl); + } catch (cleanError) { + console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError); + } + return false; + } + } + return false; // No 'qq' query found + } + + // --- History Management --- + + function handleNewChat(isFromQuery = false) { + if (isThinking) return; // Don't allow new chat while responding + + // Only save if NOT triggered immediately by a query parameter load + if (!isFromQuery) { + saveCurrentChat(); + } + + currentChatId = `chat_${Date.now()}`; + conversationHistory = []; // Clear message history state + chatMessages.innerHTML = ""; // Start with clean slate for query + if (!isFromQuery) { + // Show welcome only if manually started + chatMessages.innerHTML = + '
    Started a new chat! Ask me anything about Crawl4AI.
    '; + } + addCitations([]); // Clear citations + updateCitationsDisplay(); // Clear UI + + // Add to index and save + let index = loadChatIndex(); + // Generate a generic title initially, update later + const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`; + // index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start + index.unshift({ id: currentChatId, title: newTitle }); + saveChatIndex(index); + + renderHistoryList(index); // Update UI + setActiveHistoryItem(currentChatId); + saveCurrentChat(); // Save the empty new chat state + } + + function loadChat(chatId) { + if (isThinking || chatId === currentChatId) return; + + // Check if chat data actually exists before proceeding + const storedChat = localStorage.getItem(CHAT_PREFIX + chatId); + if (storedChat === null) { + console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`); + deleteChatData(chatId); // Clean up index + loadChatHistoryIndex(); // Reload history list + loadInitialChat(); // Load next available chat + return; + } + + console.log(`Loading chat: ${chatId}`); + saveCurrentChat(); // Save current before switching + + try { + conversationHistory = JSON.parse(storedChat); + currentChatId = chatId; + renderChatMessages(conversationHistory); + updateCitationsDisplay(); + setActiveHistoryItem(chatId); + } catch (e) { + console.error("Error loading chat:", chatId, e); + alert("Failed to load chat data."); + conversationHistory = []; + renderChatMessages(conversationHistory); + updateCitationsDisplay(); + } + } + + function saveCurrentChat() { + if (currentChatId && conversationHistory.length > 0) { + try { + localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory)); + console.log(`Chat ${currentChatId} saved.`); + + // Update title in index (e.g., use first user message) + let index = loadChatIndex(); + const currentItem = index.find((item) => item.id === 
currentChatId); + if ( + currentItem && + conversationHistory[0]?.sender === "user" && + !currentItem.title.startsWith("Chat about:") + ) { + currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`; + saveChatIndex(index); + // Re-render history list if title changed - small optimization needed here maybe + renderHistoryList(index); + setActiveHistoryItem(currentChatId); // Re-set active after re-render + } + } catch (e) { + console.error("Error saving chat:", currentChatId, e); + // Handle potential storage full errors + if (e.name === "QuotaExceededError") { + alert("Local storage is full. Cannot save chat history."); + // Consider implementing history pruning logic here + } + } + } else if (currentChatId) { + // Save empty state for newly created chats if needed, or remove? + localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([])); + } + } + + function loadChatIndex() { + try { + const storedIndex = localStorage.getItem(CHAT_INDEX_KEY); + return storedIndex ? JSON.parse(storedIndex) : []; + } catch (e) { + console.error("Error loading chat index:", e); + return []; // Return empty array on error + } + } + + function saveChatIndex(indexArray) { + try { + localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray)); + } catch (e) { + console.error("Error saving chat index:", e); + } + } + + function renderHistoryList(indexArray) { + historyList.innerHTML = ""; // Clear existing + if (!indexArray || indexArray.length === 0) { + historyList.innerHTML = '
  • No past chats found.
  • '; + return; + } + indexArray.forEach((item) => { + const li = document.createElement("li"); + li.dataset.chatId = item.id; // Add ID to li for easier selection + + const a = document.createElement("a"); + a.href = "#"; + a.dataset.chatId = item.id; + a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`; + a.title = a.textContent; // Tooltip for potentially long titles + a.addEventListener("click", (e) => { + e.preventDefault(); + loadChat(item.id); + }); + + // === Add Delete Button === + const deleteBtn = document.createElement("button"); + deleteBtn.className = "delete-chat-btn"; + deleteBtn.innerHTML = "βœ•"; // Trash can emoji/icon (or use text/SVG/FontAwesome) + deleteBtn.title = "Delete Chat"; + deleteBtn.dataset.chatId = item.id; // Store ID on button too + deleteBtn.addEventListener("click", handleDeleteChat); + + li.appendChild(a); + li.appendChild(deleteBtn); // Append button to the list item + historyList.appendChild(li); + }); + } + + function renderChatMessages(messages) { + chatMessages.innerHTML = ""; // Clear existing messages + messages.forEach((message) => { + // Ensure highlighting is applied when loading from history + addMessageToChat(message, false); + }); + if (messages.length === 0) { + chatMessages.innerHTML = + '
    Chat history loaded. Ask a question!
    '; + } + // Scroll to bottom after loading messages + scrollToBottom(); + } + + function setActiveHistoryItem(chatId) { + document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active")); + // Select the LI element directly now + const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`); + if (activeLi) { + activeLi.classList.add("active"); + } + } + + function loadInitialChat() { + const index = loadChatIndex(); + if (index.length > 0) { + loadChat(index[0].id); + } else { + // Check if handleNewChat wasn't already called by query handler + if (!currentChatId) { + handleNewChat(); + } + } + } + + function loadChatHistoryIndex() { + const index = loadChatIndex(); + renderHistoryList(index); + if (currentChatId) setActiveHistoryItem(currentChatId); + } + + // === NEW Function to Handle Delete Click === + function handleDeleteChat(event) { + event.stopPropagation(); // Prevent triggering loadChat on the link behind it + const button = event.currentTarget; + const chatIdToDelete = button.dataset.chatId; + + if (!chatIdToDelete) return; + + // Confirmation dialog + if ( + window.confirm( + `Are you sure you want to delete this chat session?\n"${ + button.previousElementSibling?.textContent || "Chat " + chatIdToDelete + }"` + ) + ) { + console.log(`Deleting chat: ${chatIdToDelete}`); + + // Perform deletion + const updatedIndex = deleteChatData(chatIdToDelete); + + // If the deleted chat was the currently active one, load another chat + if (currentChatId === chatIdToDelete) { + currentChatId = null; // Reset current ID + conversationHistory = []; // Clear state + if (updatedIndex.length > 0) { + // Load the new top chat (most recent remaining) + loadChat(updatedIndex[0].id); + } else { + // No chats left, start a new one + handleNewChat(); + } + } else { + // If a different chat was deleted, just re-render the list + renderHistoryList(updatedIndex); + // Re-apply active state in case IDs shifted (though they 
shouldn't) + setActiveHistoryItem(currentChatId); + } + } + } + + // === NEW Function to Delete Chat Data === + function deleteChatData(chatId) { + // Remove chat data + localStorage.removeItem(CHAT_PREFIX + chatId); + + // Update index + let index = loadChatIndex(); + index = index.filter((item) => item.id !== chatId); + saveChatIndex(index); + + console.log(`Chat ${chatId} data and index entry removed.`); + return index; // Return the updated index + } + + // --- Virtual Scrolling Placeholder --- + // NOTE: Virtual scrolling is complex. For now, we do direct rendering. + // If performance becomes an issue with very long chats/history, + // investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'. + // You would replace parts of `renderChatMessages` and `renderHistoryList` + // to work with the chosen library's API (providing data and item renderers). + console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories."); +}); diff --git a/docs/md_v2/ask_ai/index.html b/docs/md_v2/ask_ai/index.html new file mode 100644 index 00000000..5fe79b12 --- /dev/null +++ b/docs/md_v2/ask_ai/index.html @@ -0,0 +1,64 @@ + + + + + + Crawl4AI Assistant + + + + + + + + +
    + + + + + +
    +
    + +
    + Welcome to the Crawl4AI Assistant! How can I help you today? +
    +
    +
    + + + + +
    +
    + + + + +
    + + + + + + + + + \ No newline at end of file diff --git a/docs/md_v2/assets/copy_code.js b/docs/md_v2/assets/copy_code.js new file mode 100644 index 00000000..20e6be4f --- /dev/null +++ b/docs/md_v2/assets/copy_code.js @@ -0,0 +1,62 @@ +// ==== File: docs/assets/copy_code.js ==== + +document.addEventListener('DOMContentLoaded', () => { + // Target specifically code blocks within the main content area + const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code'); + + codeBlocks.forEach((codeElement) => { + const preElement = codeElement.parentElement; // The
     tag
    +
    +        // Ensure the 
     tag can contain a positioned button
    +        if (window.getComputedStyle(preElement).position === 'static') {
    +            preElement.style.position = 'relative';
    +        }
    +
    +        // Create the button
    +        const copyButton = document.createElement('button');
    +        copyButton.className = 'copy-code-button';
    +        copyButton.type = 'button';
    +        copyButton.setAttribute('aria-label', 'Copy code to clipboard');
    +        copyButton.title = 'Copy code to clipboard';
    +        copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
    +
    +        // Append the button to the 
     element
    +        preElement.appendChild(copyButton);
    +
    +        // Add click event listener
    +        copyButton.addEventListener('click', () => {
    +            copyCodeToClipboard(codeElement, copyButton);
    +        });
    +    });
    +
    +    async function copyCodeToClipboard(codeElement, button) {
    +        // Use innerText to get the rendered text content, preserving line breaks
    +        const textToCopy = codeElement.innerText;
    +
    +        try {
    +            await navigator.clipboard.writeText(textToCopy);
    +
    +            // Visual feedback
    +            button.innerHTML = 'Copied!';
    +            button.classList.add('copied');
    +            button.disabled = true; // Temporarily disable
    +
    +            // Revert button state after a short delay
    +            setTimeout(() => {
    +                button.innerHTML = 'Copy';
    +                button.classList.remove('copied');
    +                button.disabled = false;
    +            }, 2000); // Show "Copied!" for 2 seconds
    +
    +        } catch (err) {
    +            console.error('Failed to copy code: ', err);
    +            // Optional: Provide error feedback on the button
    +            button.innerHTML = 'Error';
    +            setTimeout(() => {
    +                button.innerHTML = 'Copy';
    +            }, 2000);
    +        }
    +    }
    +
    +    console.log("Copy Code Button script loaded.");
    +});
    \ No newline at end of file
    diff --git a/docs/md_v2/assets/floating_ask_ai_button.js b/docs/md_v2/assets/floating_ask_ai_button.js
    new file mode 100644
    index 00000000..177c2356
    --- /dev/null
    +++ b/docs/md_v2/assets/floating_ask_ai_button.js
    @@ -0,0 +1,39 @@
    +// ==== File: docs/assets/floating_ask_ai_button.js ====
    +
    +document.addEventListener('DOMContentLoaded', () => {
    +    const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
    +    const currentPath = window.location.pathname;
    +
    +    // Determine the base URL for constructing the link correctly,
    +    // especially if deployed in a sub-directory.
    +    // This assumes a simple structure; adjust if needed.
    +    const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
    +
    +
    +    // Check if the current page IS the Ask AI page
    +    // Use includes() for flexibility (handles trailing slash or .html)
    +    if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
    +        console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
    +        return; // Don't add the button on the target page
    +    }
    +
    +    // --- Create the button ---
    +    const fabLink = document.createElement('a');
    +    fabLink.className = 'floating-ask-ai-button';
    +    fabLink.href = askAiPagePath; // Construct the correct URL
    +    fabLink.title = 'Ask Crawl4AI Assistant';
    +    fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
    +
    +    // Add content (using SVG icon for better visuals)
    +    fabLink.innerHTML = `
    +        
    +            
    +        
    +        Ask AI
    +    `;
    +
    +    // Append to body
    +    document.body.appendChild(fabLink);
    +
    +    console.log("Floating Ask AI Button added.");
    +});
    \ No newline at end of file
    diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css
    index db5fac55..f8dbedde 100644
    --- a/docs/md_v2/assets/layout.css
    +++ b/docs/md_v2/assets/layout.css
    @@ -72,7 +72,7 @@ body {
     #terminal-mkdocs-side-panel {
         position: fixed;
         top: var(--header-height);
    -    left: max(0px, calc((100vw - var(--content-max-width)) / 2)); 
    +    left: max(0px, calc((90vw - var(--content-max-width)) / 2)); 
         bottom: 0;
         width: var(--sidebar-width);
         background-color: var(--background-color);
    @@ -294,4 +294,148 @@ footer {
          .github-stats-badge {
             display: none; /* Example: Hide completely on smallest screens */
          }
    +}
    +
    +/* --- Ask AI Selection Button --- */
    +.ask-ai-selection-button {
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border: none;
    +    padding: 4px 8px;
    +    font-size: 0.8em;
    +    border-radius: 4px;
    +    cursor: pointer;
    +    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
    +    transition: background-color 0.2s ease;
    +    white-space: nowrap;
    +}
    +
    +.ask-ai-selection-button:hover {
    +    background-color: var(--primary-color, #50ffff);
    +}
    +
    +/* ==== File: docs/assets/layout.css (Additions) ==== */
    +
    +/* ... (keep all existing layout CSS) ... */
    +
    +/* --- Copy Code Button Styling --- */
    +
    +/* Ensure the parent 
     can contain the absolutely positioned button */
    +#terminal-mkdocs-main-content pre {
    +    position: relative; /* Needed for absolute positioning of child */
    +    /* Add a little padding top/right to make space for the button */
    +    padding-top: 2.5em;
    +    padding-right: 1em; /* Ensure padding is sufficient */
    +}
    +
    +.copy-code-button {
    +    position: absolute;
    +    top: 0.5em; /* Adjust spacing from top */
    +    left: 0.5em; /* Adjust spacing from left */
    +    z-index: 1; /* Sit on top of code */
    +
    +    background-color: var(--progress-bar-background, #444); /* Use a background */
    +    color: var(--font-color, #eaeaea);
    +    border: 1px solid var(--secondary-color, #727578);
    +    padding: 3px 8px;
    +    font-size: 0.8em;
    +    font-family: var(--font-stack, monospace);
    +    border-radius: 4px;
    +    cursor: pointer;
    +    opacity: 0; /* Hidden by default */
    +    transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
    +    white-space: nowrap;
    +}
    +
    +/* Show button on hover of the 
     container */
    +#terminal-mkdocs-main-content pre:hover .copy-code-button {
    +    opacity: 0.8; /* Show partially */
    +}
    +
    +.copy-code-button:hover {
    +    opacity: 1; /* Fully visible on button hover */
    +    background-color: var(--secondary-color, #727578);
    +}
    +
    +.copy-code-button:focus {
    +     opacity: 1; /* Ensure visible when focused */
    +     outline: 1px dashed var(--primary-color);
    +}
    +
    +
    +/* Style for "Copied!" state */
    +.copy-code-button.copied {
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border-color: var(--primary-dimmed-color, #09b5a5);
    +    opacity: 1; /* Ensure visible */
    +}
    +.copy-code-button.copied:hover {
    +     background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
    +}
    +
    +/* ==== File: docs/assets/layout.css (Additions) ==== */
    +
    +/* ... (keep all existing layout CSS) ... */
    +
    +/* --- Floating Ask AI Button --- */
    +.floating-ask-ai-button {
    +    position: fixed;
    +    bottom: 25px;
    +    right: 25px;
    +    z-index: 1050; /* Below modals, above most content */
    +
    +    background-color: var(--primary-dimmed-color, #09b5a5);
    +    color: var(--background-color, #070708);
    +    border: none;
    +    border-radius: 50%; /* Make it circular */
    +    width: 60px; /* Adjust size */
    +    height: 60px; /* Adjust size */
    +    padding: 10px; /* Adjust padding */
    +    box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
    +    cursor: pointer;
    +    transition: background-color 0.2s ease, transform 0.2s ease;
    +
    +    display: flex;
    +    flex-direction: column; /* Stack icon and text */
    +    align-items: center;
    +    justify-content: center;
    +    text-decoration: none;
    +    text-align: center;
    +}
    +
    +.floating-ask-ai-button svg {
    +    width: 24px; /* Control icon size */
    +    height: 24px;
    +}
    +
    +.floating-ask-ai-button span {
    +    font-size: 0.7em;
    +    margin-top: 2px; /* Space between icon and text */
    +    display: block; /* Ensure it takes space */
    +     line-height: 1;
    +}
    +
    +
    +.floating-ask-ai-button:hover {
    +    background-color: var(--primary-color, #50ffff);
    +    transform: scale(1.05); /* Slight grow effect */
    +}
    +
    +.floating-ask-ai-button:focus {
    +     outline: 2px solid var(--primary-color);
    +     outline-offset: 2px;
    +}
    +
    +/* Optional: Hide text on smaller screens if needed */
    +@media screen and (max-width: 768px) {
    +     .floating-ask-ai-button span {
    +        /* display: none; */ /* Uncomment to hide text */
    +     }
    +     .floating-ask-ai-button {
    +        width: 55px;
    +        height: 55px;
    +        bottom: 20px;
    +        right: 20px;
    +     }
     }
    \ No newline at end of file
    diff --git a/docs/md_v2/assets/selection_ask_ai.js b/docs/md_v2/assets/selection_ask_ai.js
    new file mode 100644
    index 00000000..b5cb471d
    --- /dev/null
    +++ b/docs/md_v2/assets/selection_ask_ai.js
    @@ -0,0 +1,109 @@
    +// ==== File: docs/assets/selection_ask_ai.js ====
    +
    +document.addEventListener('DOMContentLoaded', () => {
    +    let askAiButton = null;
    +    const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
    +
    +    function createAskAiButton() {
    +        const button = document.createElement('button');
    +        button.id = 'ask-ai-selection-btn';
    +        button.className = 'ask-ai-selection-button';
    +        button.textContent = 'Ask AI'; // Or use an icon
    +        button.style.display = 'none'; // Initially hidden
    +        button.style.position = 'absolute';
    +        button.style.zIndex = '1500'; // Ensure it's on top
    +        document.body.appendChild(button);
    +
    +        button.addEventListener('click', handleAskAiClick);
    +        return button;
    +    }
    +
    +    function getSafeSelectedText() {
    +        const selection = window.getSelection();
    +        if (!selection || selection.rangeCount === 0) {
    +            return null;
    +        }
    +        // Avoid selecting text within the button itself if it was somehow selected
    +        const container = selection.getRangeAt(0).commonAncestorContainer;
    +        if (askAiButton && askAiButton.contains(container)) {
    +             return null;
    +        }
    +
    +        const text = selection.toString().trim();
    +        return text.length > 0 ? text : null;
    +    }
    +
    +    function positionButton(event) {
    +         const selection = window.getSelection();
    +         if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
    +             hideButton();
    +             return;
    +         }
    +
    +        const range = selection.getRangeAt(0);
    +        const rect = range.getBoundingClientRect();
    +
    +        // Calculate position: top-right of the selection
    +        const scrollX = window.scrollX;
    +        const scrollY = window.scrollY;
    +        const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
    +        const buttonLeft = rect.right + scrollX + 5; // 5px to the right
    +
    +        askAiButton.style.top = `${buttonTop}px`;
    +        askAiButton.style.left = `${buttonLeft}px`;
    +        askAiButton.style.display = 'block'; // Show the button
    +    }
    +
    +    function hideButton() {
    +        if (askAiButton) {
    +            askAiButton.style.display = 'none';
    +        }
    +    }
    +
    +    function handleAskAiClick(event) {
    +        event.stopPropagation(); // Prevent mousedown from hiding button immediately
    +        const selectedText = getSafeSelectedText();
    +        if (selectedText) {
    +            console.log("Selected Text:", selectedText);
    +            // Base64 encode for URL safety (handles special chars, line breaks)
    +            // Use encodeURIComponent first for proper Unicode handling before btoa
    +            const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
    +            const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
    +            console.log("Navigating to:", targetUrl);
    +            window.location.href = targetUrl; // Navigate to Ask AI page
    +        }
    +        hideButton(); // Hide after click
    +    }
    +
    +    // --- Event Listeners ---
    +
    +    // Show button on mouse up after selection
    +    document.addEventListener('mouseup', (event) => {
    +        // Slight delay to ensure selection is registered
    +        setTimeout(() => {
    +            const selectedText = getSafeSelectedText();
    +            if (selectedText) {
    +                if (!askAiButton) {
    +                    askAiButton = createAskAiButton();
    +                }
    +                // Don't position if the click was ON the button itself
    +                if (event.target !== askAiButton) {
    +                     positionButton(event);
    +                }
    +            } else {
    +                hideButton();
    +            }
    +        }, 10); // Small delay
    +    });
    +
    +    // Hide button on scroll or click elsewhere
    +    document.addEventListener('mousedown', (event) => {
    +        // Hide if clicking anywhere EXCEPT the button itself
    +        if (askAiButton && event.target !== askAiButton) {
    +            hideButton();
    +        }
    +    });
    +    document.addEventListener('scroll', hideButton, true); // Capture scroll events
    +
    +    console.log("Selection Ask AI script loaded.");
    +});
    \ No newline at end of file
    diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css
    index 751aabb7..92e01f85 100644
    --- a/docs/md_v2/assets/styles.css
    +++ b/docs/md_v2/assets/styles.css
    @@ -6,8 +6,8 @@
     }
     
     :root {
    -    --global-font-size: 16px;
    -    --global-code-font-size: 16px;
    +    --global-font-size: 14px;
    +    --global-code-font-size: 13px;
         --global-line-height: 1.5em;
         --global-space: 10px;
         --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
    @@ -56,7 +56,7 @@
         --toc-width: 240px; /* Adjust based on your desired ToC width */
         --layout-transition-speed: 0.2s; /* For potential future animations */
     
    -    --page-width : 90em; /* Adjust based on your design */
    +    --page-width : 100em; /* Adjust based on your design */
     }
     
     
    diff --git a/docs/md_v2/core/ask-ai.md b/docs/md_v2/core/ask-ai.md
    new file mode 100644
    index 00000000..9122bd29
    --- /dev/null
    +++ b/docs/md_v2/core/ask-ai.md
    @@ -0,0 +1,74 @@
    +
    + +
    + + + + diff --git a/mkdocs.yml b/mkdocs.yml index 1c7be7a3..39e03a88 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,10 +7,11 @@ docs_dir: docs/md_v2 nav: - Home: 'index.md' + - "Ask AI": "core/ask-ai.md" + - "Quick Start": "core/quickstart.md" - Setup & Installation: - "Installation": "core/installation.md" - "Docker Deployment": "core/docker-deployment.md" - - "Quick Start": "core/quickstart.md" - "Blog & Changelog": - "Blog Home": "blog/index.md" - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md" @@ -86,4 +87,7 @@ extra_javascript: - assets/highlight_init.js - https://buttons.github.io/buttons.js - assets/toc.js - - assets/github_stats.js \ No newline at end of file + - assets/github_stats.js + - assets/selection_ask_ai.js + - assets/copy_code.js + - assets/floating_ask_ai_button.js \ No newline at end of file