diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index af98e607..2f421178 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -122,23 +122,25 @@ def from_serializable_dict(data: Any) -> Any: # Handle typed data if isinstance(data, dict) and "type" in data: # Handle plain dictionaries - if data["type"] == "dict": + if data["type"] == "dict" and "value" in data: return {k: from_serializable_dict(v) for k, v in data["value"].items()} # Import from crawl4ai for class instances import crawl4ai - cls = getattr(crawl4ai, data["type"]) + if hasattr(crawl4ai, data["type"]): + cls = getattr(crawl4ai, data["type"]) - # Handle Enum - if issubclass(cls, Enum): - return cls(data["params"]) + # Handle Enum + if issubclass(cls, Enum): + return cls(data["params"]) - # Handle class instances - constructor_args = { - k: from_serializable_dict(v) for k, v in data["params"].items() - } - return cls(**constructor_args) + if "params" in data: + # Handle class instances + constructor_args = { + k: from_serializable_dict(v) for k, v in data["params"].items() + } + return cls(**constructor_args) # Handle lists if isinstance(data, list): diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index f3c7d861..bfe22f4e 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -491,10 +491,12 @@ class BrowserManager: Note: This method should be called in a separate task to avoid blocking the main event loop. 
""" - if self.playwright is None: - from playwright.async_api import async_playwright + if self.playwright is not None: + await self.close() + + from playwright.async_api import async_playwright - self.playwright = await async_playwright().start() + self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index bf4825cc..954fe37e 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -7,7 +7,9 @@ import time from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( - DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, + DEFAULT_PROVIDER, + DEFAULT_PROVIDER_API_KEY, + CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, ) @@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy): """ super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config + if not self.llm_config: + self.llm_config = create_llm_config( + provider=DEFAULT_PROVIDER, + api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY), + ) self.instruction = instruction self.extract_type = extraction_type self.schema = schema diff --git a/deploy/docker/README-new.md b/deploy/docker/README-new.md new file mode 100644 index 00000000..3a9bdf52 --- /dev/null +++ b/deploy/docker/README-new.md @@ -0,0 +1,644 @@ +# Crawl4AI Docker Guide 🐳 + +## Table of Contents +- [Prerequisites](#prerequisites) +- [Installation](#installation) + - [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended) + - [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run) + - [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the 
API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Server Configuration](#server-configuration) + - [Understanding config.yml](#understanding-configyml) + - [JWT Authentication](#jwt-authentication) + - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices) + - [Customizing Your Configuration](#customizing-your-configuration) + - [Configuration Recommendations](#configuration-recommendations) +- [Getting Help](#getting-help) + +## Prerequisites + +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop). +- `git` for cloning the repository. +- At least 4GB of RAM available for the container (more recommended for heavy use). +- Python 3.10+ (if using the Python SDK). +- Node.js 16+ (if using the Node.js examples). + +> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources. + +## Installation + +We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs. + +### Option 1: Using Docker Compose (Recommended) + +Docker Compose simplifies building and running the service, especially for local development and testing across different platforms. + +#### 1. Clone Repository + +```bash +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +#### 2. Environment Setup (API Keys) + +If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**. 
+ +```bash +# Make sure you are in the 'crawl4ai' root directory +cp deploy/docker/.llm.env.example .llm.env + +# Now edit .llm.env and add your API keys +# Example content: +# OPENAI_API_KEY=sk-your-key +# ANTHROPIC_API_KEY=your-anthropic-key +# ... +``` +> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control. + +#### 3. Build and Run with Compose + +The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**. + +* **Build and Run Locally (AMD64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-amd64 up --build -d + ``` + +* **Build and Run Locally (ARM64):** + ```bash + # Builds the image locally using Dockerfile and runs it + docker compose --profile local-arm64 up --build -d + ``` + +* **Run Pre-built Image from Docker Hub (AMD64):** + ```bash + # Pulls and runs the specified AMD64 image from Docker Hub + # (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1) + docker compose --profile hub-amd64 up -d + ``` + +* **Run Pre-built Image from Docker Hub (ARM64):** + ```bash + # Pulls and runs the specified ARM64 image from Docker Hub + docker compose --profile hub-arm64 up -d + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping Compose Services + +```bash +# Stop the service(s) associated with a profile (e.g., local-amd64) +docker compose --profile local-amd64 down +``` + +### Option 2: Manual Local Build & Run + +Use this option if you prefer not to use Docker Compose for local builds. + +#### 1. Clone Repository & Setup Environment + +Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root). + +#### 2. Build the Image (Multi-Arch) + +Use `docker buildx` to build the image. This example builds for multiple platforms and loads the result into the local Docker daemon. Note that `--load` can only import a multi-platform build when Docker's containerd image store is enabled; with the classic image store, pass a single `--platform` value matching your host instead. 
+ +```bash +# Make sure you are in the 'crawl4ai' root directory +docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load . +``` + +#### 3. Run the Container + +* **Basic run (no LLM support):** + ```bash + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory (project root) + # Replace --platform if your host is ARM64 + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-standalone \ + --env-file .llm.env \ + --shm-size=1g \ + --platform linux/amd64 \ + crawl4ai-local:latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Manual Container + +```bash +docker stop crawl4ai-standalone && docker rm crawl4ai-standalone +``` + +### Option 3: Using Pre-built Docker Hub Images + +Pull and run images directly from Docker Hub without building locally. + +#### 1. Pull the Image + +We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically. + +```bash +# Pull a specific version (recommended for stability) +docker pull unclecode/crawl4ai:0.5.1-d1 + +# Or pull the latest stable version +docker pull unclecode/crawl4ai:latest +``` + +#### 2. Setup Environment (API Keys) + +If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section. + +#### 3. 
Run the Container + +* **Basic run:** + ```bash + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +* **With LLM support:** + ```bash + # Make sure .llm.env is in the current directory you are running docker from + docker run -d \ + -p 11235:11235 \ + --name crawl4ai-hub \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:0.5.1-d1 # Or use :latest + ``` + +> The server will be available at `http://localhost:11235`. + +#### 4. Stopping the Hub Container + +```bash +docker stop crawl4ai-hub && docker rm crawl4ai-hub +``` + +#### Docker Hub Versioning Explained + +* **Image Name:** `unclecode/crawl4ai` +* **Tag Format:** `LIBRARY_VERSION-dREVISION` + * `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`). + * `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`. +* **Example:** `unclecode/crawl4ai:0.5.1-d1` +* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`. +* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture. + +--- + +*(Rest of the document remains largely the same, but with key updates below)* + +--- + +## Dockerfile Parameters + +You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file. + +```bash +# Example: Build with 'all' features using buildx +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --build-arg INSTALL_TYPE=all \ + -t yourname/crawl4ai-all:latest \ + --load \ + . 
# Build from root context +``` + +### Build Arguments Explained + +| Argument | Description | Default | Options | +| :----------- | :--------------------------------------- | :-------- | :--------------------------------- | +| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` | +| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` | +| APP_HOME | Install path inside container (advanced) | `/app` | any valid path | +| USE_LOCAL | Install library from local source | `true` | `true`, `false` | +| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL | +| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name | + +*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)* + +### Build Best Practices + +1. **Choose the Right Install Type** + * `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation. + * `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras. +2. **Platform Considerations** + * Use `buildx` for building multi-architecture images, especially for pushing to registries. + * Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds. +3. **Performance Optimization** + * The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64). + +--- + +## Using the API + +Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests. 
+ +### Python SDK + +Install the SDK: `pip install crawl4ai` + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed + +async def main(): + # Point to the correct server port + async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client: + # If JWT is enabled on the server, authenticate first: + # await client.authenticate("user@example.com") # See Server Configuration section + + # Example Non-streaming crawl + print("--- Running Non-Streaming Crawl ---") + results = await client.crawl( + ["https://httpbin.org/html"], + browser_config=BrowserConfig(headless=True), # Use library classes for config aid + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + if results: # client.crawl returns None on failure + print(f"Non-streaming results success: {results.success}") + if results.success: + for result in results: # Iterate through the CrawlResultContainer + print(f"URL: {result.url}, Success: {result.success}") + else: + print("Non-streaming crawl failed.") + + + # Example Streaming crawl + print("\n--- Running Streaming Crawl ---") + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + try: + async for result in await client.crawl( # client.crawl returns an async generator for streaming + ["https://httpbin.org/html", "https://httpbin.org/links/5/0"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed result: URL: {result.url}, Success: {result.success}") + except Exception as e: + print(f"Streaming crawl failed: {e}") + + + # Example Get schema + print("\n--- Getting Schema ---") + schema = await client.get_schema() + print(f"Schema received: {bool(schema)}") # Print whether schema was received + +if __name__ == "__main__": + asyncio.run(main()) +``` + +*(SDK parameters like timeout, verify_ssl etc. 
remain the same)* + +### Second Approach: Direct API Calls + +Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`. + +*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)* + +#### More Examples *(Ensure Schema example uses type/value wrapper)* + +**Advanced Crawler Configuration** +*(Keep example, ensure cache_mode uses valid enum value like "bypass")* + +**Extraction Strategy** +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", + "value": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)* +*(Keep Deep Crawler Example)* + +### REST API Examples + +Update URLs to use port `11235`. 
+ +#### Simple Crawl + +```python +import requests + +# Configuration objects converted to the required JSON structure +browser_config_payload = { + "type": "BrowserConfig", + "params": {"headless": True} +} +crawler_config_payload = { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum +} + +crawl_payload = { + "urls": ["https://httpbin.org/html"], + "browser_config": browser_config_payload, + "crawler_config": crawler_config_payload +} +response = requests.post( + "http://localhost:11235/crawl", # Updated port + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled + json=crawl_payload +) +print(f"Status Code: {response.status_code}") +if response.ok: + print(response.json()) +else: + print(f"Error: {response.text}") + +``` + +#### Streaming Results + +```python +import json +import httpx # Use httpx for async streaming example + +async def test_stream_crawl(token: str = None): # Made token optional + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:11235/crawl/stream" # Updated port + payload = { + "urls": [ + "https://httpbin.org/html", + "https://httpbin.org/links/5/0", + ], + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": "bypass"} + } + } + + headers = {} + # if token: + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled + + try: + async with httpx.AsyncClient() as client: + async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response: + print(f"Status: {response.status_code} (Expected: 200)") + response.raise_for_status() # Raise exception for bad status codes + + # Read streaming response line-by-line (NDJSON) + async for line in response.aiter_lines(): + if line: + try: + 
data = json.loads(line) + # Check for completion marker + if data.get("status") == "completed": + print("Stream completed.") + break + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except json.JSONDecodeError: + print(f"Warning: Could not decode JSON line: {line}") + + except httpx.HTTPStatusError as e: + print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") + +# To run this example: +# import asyncio +# asyncio.run(test_stream_crawl()) +``` + +--- + +## Metrics & Monitoring + +Keep an eye on your crawler with these endpoints: + +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema + +Example health check: +```bash +curl http://localhost:11235/health +``` + +--- + +*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)* + +--- + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. + +### Understanding config.yml + +The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build. + +Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`): + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" + version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1" + host: "0.0.0.0" + port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf). + reload: False # Default set to False - suitable for production + timeout_keep_alive: 300 + +# Default LLM Configuration +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + # api_key: sk-... 
# If you pass the API key directly then api_key_env will be ignored + +# Redis Configuration (Used by internal Redis server managed by supervisord) +redis: + host: "localhost" + port: 6379 + db: 0 + password: "" + # ... other redis options ... + +# Rate Limiting Configuration +rate_limiting: + enabled: True + default_limit: "1000/minute" + trusted_proxies: [] + storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: false # Enable JWT authentication (requires security.enabled=true) + https_redirect: false # Force HTTPS (requires security.enabled=true) + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers (applied if security.enabled=true) + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher + timeouts: + stream_init: 30.0 # Timeout for stream initialization + batch_process: 300.0 # Timeout for non-streaming /crawl processing + +# Logging Configuration +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True + endpoint: "/metrics" + health_check: + endpoint: "/health" +``` + +*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)* + +*(Configuration Tips and Best Practices remain the same)* + +### Customizing Your Configuration + +You can override the default `config.yml`. + +#### Method 1: Modify Before Build + +1. Edit the `deploy/docker/config.yml` file in your local repository clone. +2. 
Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image. + +#### Method 2: Runtime Mount (Recommended for Custom Deploys) + +1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections. +2. Mount it when running the container: + + * **Using `docker run`:** + ```bash + # Assumes my-custom-config.yml is in the current directory + docker run -d -p 11235:11235 \ + --name crawl4ai-custom-config \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest # Or your specific tag + ``` + + * **Using `docker-compose.yml`:** Add a `volumes` section to the service definition: + ```yaml + services: + crawl4ai-hub-amd64: # Or your chosen service + image: unclecode/crawl4ai:latest + profiles: ["hub-amd64"] + <<: *base-config + volumes: + # Mount local custom config over the default one in the container + - ./my-custom-config.yml:/app/config.yml + # Keep the shared memory volume from base-config + - /dev/shm:/dev/shm + ``` + *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)* + +> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration. + +### Configuration Recommendations + +1. **Security First** 🔒 + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** 💻 + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. 
**Monitoring** 📊 + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. **Performance Tuning** ⚡ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- 📖 Check our [full documentation](https://docs.crawl4ai.com) +- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- 💬 Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 + +Happy crawling! 
🕷️ diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 33802772..c01696b2 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -388,21 +388,25 @@ async def handle_crawl_request( ) ) - async with AsyncWebCrawler(config=browser_config) as crawler: - results = [] - func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") - partial_func = partial(func, - urls[0] if len(urls) == 1 else urls, - config=crawler_config, - dispatcher=dispatcher) - results = await partial_func() - return { - "success": True, - "results": [result.model_dump() for result in results] - } + crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + results = [] + func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") + partial_func = partial(func, + urls[0] if len(urls) == 1 else urls, + config=crawler_config, + dispatcher=dispatcher) + results = await partial_func() + await crawler.close() + return { + "success": True, + "results": [result.model_dump() for result in results] + } except Exception as e: logger.error(f"Crawl error: {str(e)}", exc_info=True) + if 'crawler' in locals(): + await crawler.close() raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index b7ef4885..3b5fead6 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -4,7 +4,7 @@ app: version: "1.0.0" host: "0.0.0.0" port: 8020 - reload: True + reload: False timeout_keep_alive: 300 # Default LLM Configuration diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py new file mode 100644 index 00000000..ab8b8ced --- /dev/null +++ b/tests/docker/test_server_requests.py @@ -0,0 +1,650 @@ +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +# Optional: Import crawl4ai classes directly for reference/easier payload 
creation aid +# You don't strictly NEED these imports for the tests to run against the server, +# but they help in understanding the structure you are mimicking in JSON. +from crawl4ai import ( + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DefaultMarkdownGenerator, + PruningContentFilter, + BM25ContentFilter, + BFSDeepCrawlStrategy, + FilterChain, + ContentTypeFilter, + DomainFilter, + CompositeScorer, + KeywordRelevanceScorer, + PathDepthScorer, + JsonCssExtractionStrategy, + LLMExtractionStrategy, + LLMConfig +) + +# --- Test Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable +# Use a known simple HTML page for basic tests +SIMPLE_HTML_URL = "https://httpbin.org/html" +# Use a site suitable for scraping tests +SCRAPE_TARGET_URL = "http://books.toscrape.com/" +# Use a site with internal links for deep crawl tests +DEEP_CRAWL_URL = "https://python.org" + +# --- Pytest Fixtures --- + +# Use the built-in event_loop fixture from pytest_asyncio +# The custom implementation was causing issues with closing the loop + +@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) + yield client + await client.aclose() + +# --- Helper Functions --- + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. 
Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any]): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result + # Add more common checks if needed + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + else: + results.append(data) + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." + return results + + +# --- Test Class --- + +@pytest.mark.asyncio +class TestCrawlEndpoints: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + # 1. 
Simple Requests (Primitives) + async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient): + """Test /crawl with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, # Explicitly false for /crawl + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value # Use enum value + } + } + } + try: + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error: {e}") + print(f"Response content: {e.response.text}") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"] + # We don't specify a markdown generator in this test, so don't make assumptions about markdown field + # It might be null, missing, or populated depending on the server's default behavior + + async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with a single URL and simple config values.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, # Must be true for /crawl/stream + "screenshot": False, + "cache_mode": CacheMode.BYPASS.value + } + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == 1 + result = results[0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] == SIMPLE_HTML_URL + assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"] + + + # 2. Multi-URL and Dispatcher + async def test_multi_url_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with multiple URLs, implicitly testing dispatcher.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value} + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + assert len(data["results"]) == len(urls) + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + + async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient): + """Test /crawl/stream with multiple URLs.""" + urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"] + payload = { + "urls": urls, + "browser_config": { + "type": "BrowserConfig", + "params": {"headless": True} + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value} + } + } + async with async_client.stream("POST", "/crawl/stream", json=payload) as 
response: + response.raise_for_status() + results = await process_streaming_response(response) + + assert len(results) == len(urls) + processed_urls = set() + for result in results: + await assert_crawl_result_structure(result) + assert result["success"] is True + assert result["url"] in urls + processed_urls.add(result["url"]) + assert processed_urls == set(urls) # Ensure all URLs were processed + + + # 3. Class Values and Nested Classes (Markdown Generator) + async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using PruningContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.ENABLED.value, # Test different cache mode + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.5, # Example param + "threshold_type": "relative" + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert 
result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown + assert "Moby-Dick" in result["markdown"]["raw_markdown"] + # Fit markdown content might be different/shorter due to pruning + assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"]) + + async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient): + """Test /crawl with MarkdownGenerator using BM25ContentFilter.""" + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "BM25ContentFilter", + "params": { + "user_query": "Herman Melville", # Query for BM25 + "bm25_threshold": 0.1, # Lower threshold to increase matches + "language": "english" # Valid parameters + } + } + } + } + } + } + } + try: + print(f"Payload for BM25 test: {json.dumps(payload)}") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert 
result["success"] is True + assert "markdown" in result + assert isinstance(result["markdown"], dict) + assert "raw_markdown" in result["markdown"] + assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown + + # Print values for debug + print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}") + print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}") + + # Either fit_markdown has content (possibly including our query terms) + # or it might be empty if no good BM25 matches were found + # Don't assert specific content since it can be environment-dependent + + + # 4. Deep Crawling + async def test_deep_crawl(self, async_client: httpx.AsyncClient): + """Test /crawl with a deep crawl strategy.""" + payload = { + "urls": [DEEP_CRAWL_URL], # Start URL + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": CacheMode.BYPASS.value, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 1, # Limit depth for testing speed + "max_pages": 5, # Limit pages to crawl + "filter_chain": { + "type": "FilterChain", + "params": { + "filters": [ + { + "type": "ContentTypeFilter", + "params": {"allowed_types": ["text/html"]} + }, + { + "type": "DomainFilter", + "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains + } + ] + } + }, + "url_scorer": { + "type": "CompositeScorer", + "params": { + "scorers": [ + { + "type": "KeywordRelevanceScorer", + "params": {"keywords": ["documentation", "tutorial"]} + }, + { + "type": "PathDepthScorer", + "params": {"weight": 0.5, "optimal_depth": 2} + } + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = 
response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert isinstance(data["results"], list) + # Expect more than 1 result due to deep crawl (start URL + crawled links) + assert len(data["results"]) > 1 + assert len(data["results"]) <= 6 # Start URL + max_links=5 + + start_url_found = False + crawled_urls_found = False + for result in data["results"]: + await assert_crawl_result_structure(result) + assert result["success"] is True + + # Print URL for debugging + print(f"Crawled URL: {result['url']}") + + # Allow URLs that contain python.org (including subdomains like docs.python.org) + assert "python.org" in result["url"] + if result["url"] == DEEP_CRAWL_URL: + start_url_found = True + else: + crawled_urls_found = True + + assert start_url_found + assert crawled_urls_found + + + # 5. 
Extraction without LLM (JSON/CSS) + async def test_json_css_extraction(self, async_client: httpx.AsyncClient): + """Test /crawl with JsonCssExtractionStrategy.""" + payload = { + "urls": [SCRAPE_TARGET_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure + "value": { + "name": "BookList", + "baseSelector": "ol.row li.col-xs-6", # Select each book item + "fields": [ + {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"}, + {"name": "price", "selector": "article.product_pod .price_color", "type": "text"}, + {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"} + ] + } + } + } + } + } + } + } + try: + print(f"Sending deep crawl request to server...") + response = await async_client.post("/crawl", json=payload) + print(f"Response status: {response.status_code}") + + if response.status_code >= 400: + error_detail = response.json().get('detail', 'No detail provided') + print(f"Error detail: {error_detail}") + print(f"Full response: {response.text}") + + response.raise_for_status() + data = response.json() + except httpx.HTTPStatusError as e: + print(f"Server error status: {e.response.status_code}") + print(f"Server error response: {e.response.text}") + try: + error_json = e.response.json() + print(f"Parsed error: {error_json}") + except: + print("Could not parse error response as JSON") + raise + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # 
Extracted content should be a JSON string representing a list of dicts + try: + extracted_data = json.loads(result["extracted_content"]) + assert isinstance(extracted_data, list) + assert len(extracted_data) > 0 # Should find some books + # Check structure of the first extracted item + first_item = extracted_data[0] + assert "title" in first_item + assert "price" in first_item + assert "rating" in first_item + assert "star-rating" in first_item["rating"] # e.g., "star-rating Three" + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + + + # 6. Extraction with LLM + async def test_llm_extraction(self, async_client: httpx.AsyncClient): + """ + Test /crawl with LLMExtractionStrategy. + NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY) + configured via .llm.env or environment variables. + This test uses the default provider configured in the server's config.yml. 
+ """ + payload = { + "urls": [SIMPLE_HTML_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": CacheMode.BYPASS.value, + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract the main title and the author mentioned in the text into JSON.", + # LLMConfig is implicitly defined by server's config.yml and .llm.env + # If you needed to override provider/token PER REQUEST: + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o", # Example override + "api_token": os.getenv("OPENAI_API_KEY") # Example override + } + }, + "schema": { # Optional: Provide a schema for structured output + "type": "dict", # IMPORTANT: Wrap schema dict + "value": { + "title": "Book Info", + "type": "object", + "properties": { + "title": {"type": "string", "description": "The main title of the work"}, + "author": {"type": "string", "description": "The author of the work"} + }, + "required": ["title", "author"] + } + } + } + } + } + } + } + + try: + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key) + data = response.json() + except httpx.HTTPStatusError as e: + # Catch potential server errors (like 500 due to missing/invalid API keys) + pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. 
Check server logs and ensure API keys are correctly configured for the server.") + except httpx.RequestError as e: + pytest.fail(f"LLM extraction request failed: {e}.") + + assert data["success"] is True + assert len(data["results"]) == 1 + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + + # Extracted content should be JSON (because we provided a schema) + try: + extracted_data = json.loads(result["extracted_content"]) + print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification + + # Handle both dict and list formats (server returns a list) + if isinstance(extracted_data, list): + assert len(extracted_data) > 0 + extracted_item = extracted_data[0] # Take first item + assert isinstance(extracted_item, dict) + assert "title" in extracted_item + assert "author" in extracted_item + assert "Moby-Dick" in extracted_item.get("title", "") + assert "Herman Melville" in extracted_item.get("author", "") + else: + assert isinstance(extracted_data, dict) + assert "title" in extracted_data + assert "author" in extracted_data + assert "Moby-Dick" in extracted_data.get("title", "") + assert "Herman Melville" in extracted_data.get("author", "") + except (json.JSONDecodeError, AssertionError) as e: + pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}") + except Exception as e: # Catch any other unexpected error + pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}") + +if __name__ == "__main__": + # Define arguments for pytest programmatically + # -v: verbose output + # -s: show print statements immediately (useful for debugging) + # __file__: tells pytest to run tests in the current file + pytest_args = ["-v", "-s", __file__] + + # You can add more pytest arguments here if needed, for example: + 
# '-k test_llm_extraction': Run only the LLM test function + # pytest_args.append("-k test_llm_extraction") + + print(f"Running pytest with args: {pytest_args}") + + # Execute pytest + exit_code = pytest.main(pytest_args) + + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file