fix(core): improve error handling and stability in core components

Enhance error handling and stability across multiple components:

- Add safety checks in async_configs.py for type and params existence
- Fix browser manager initialization and cleanup logic
- Add default LLM config fallback in extraction strategy
- Add comprehensive Docker deployment guide and server tests

BREAKING CHANGE: BrowserManager.start() now automatically closes existing instances
2025-04-11 20:58:39 +08:00
parent 108b2a8bfb
commit 3179d6ad0c
7 changed files with 1336 additions and 27 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -122,23 +122,25 @@ def from_serializable_dict(data: Any) -> Any:
    # Handle typed data
    if isinstance(data, dict) and "type" in data:
        # Handle plain dictionaries
-        if data["type"] == "dict":
+        if data["type"] == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

        # Import from crawl4ai for class instances
        import crawl4ai

-        cls = getattr(crawl4ai, data["type"])
+        if hasattr(crawl4ai, data["type"]):
+            cls = getattr(crawl4ai, data["type"])

-        # Handle Enum
-        if issubclass(cls, Enum):
-            return cls(data["params"])
+            # Handle Enum
+            if issubclass(cls, Enum):
+                return cls(data["params"])

-        # Handle class instances
-        constructor_args = {
-            k: from_serializable_dict(v) for k, v in data["params"].items()
-        }
-        return cls(**constructor_args)
+            if "params" in data:
+                # Handle class instances
+                constructor_args = {
+                    k: from_serializable_dict(v) for k, v in data["params"].items()
+                }
+                return cls(**constructor_args)

    # Handle lists
    if isinstance(data, list):
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -491,10 +491,12 @@ class BrowserManager:

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        """
-        if self.playwright is None:
-            from playwright.async_api import async_playwright
+        if self.playwright is not None:
+            await self.close()
+            
+        from playwright.async_api import async_playwright

-            self.playwright = await async_playwright().start()
+        self.playwright = await async_playwright().start()

        if self.config.cdp_url or self.config.use_managed_browser:
            self.config.use_managed_browser = True
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -7,7 +7,9 @@ import time

 from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
 from .config import (
-    DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
+    DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
+    CHUNK_TOKEN_THRESHOLD,
    OVERLAP_RATE,
    WORD_TOKEN_RATE,
 )
@@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
        """
        super().__init__( input_format=input_format, **kwargs)
        self.llm_config = llm_config
+        if not self.llm_config:
+            self.llm_config = create_llm_config(
+                provider=DEFAULT_PROVIDER,
+                api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
+            )
        self.instruction = instruction
        self.extract_type = extraction_type
        self.schema = schema
--- a/deploy/docker/README-new.md
+++ b/deploy/docker/README-new.md
@@ -0,0 +1,644 @@
+# Crawl4AI Docker Guide 🐳
+
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+  - [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended)
+  - [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run)
+  - [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+  - [Python SDK](#python-sdk)
+  - [Second Approach: Direct API Calls](#second-approach-direct-api-calls)
+  - [REST API Examples](#rest-api-examples)
+- [Metrics & Monitoring](#metrics--monitoring)
+- [Deployment Scenarios](#deployment-scenarios)
+- [Complete Examples](#complete-examples)
+- [Server Configuration](#server-configuration)
+  - [Understanding config.yml](#understanding-configyml)
+  - [JWT Authentication](#jwt-authentication)
+  - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
+  - [Customizing Your Configuration](#customizing-your-configuration)
+  - [Configuration Recommendations](#configuration-recommendations)
+- [Getting Help](#getting-help)
+
+## Prerequisites
+
+Before we dive in, make sure you have:
+- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
+- `git` for cloning the repository.
+- At least 4GB of RAM available for the container (more recommended for heavy use).
+- Python 3.10+ (if using the Python SDK).
+- Node.js 16+ (if using the Node.js examples).
+
+> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
+
+## Installation
+
+We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs.
+
+### Option 1: Using Docker Compose (Recommended)
+
+Docker Compose simplifies building and running the service, especially for local development and testing across different platforms.
+
+#### 1. Clone Repository
+
+```bash
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+```
+
+#### 2. Environment Setup (API Keys)
+
+If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
+
+```bash
+# Make sure you are in the 'crawl4ai' root directory
+cp deploy/docker/.llm.env.example .llm.env
+
+# Now edit .llm.env and add your API keys
+# Example content:
+# OPENAI_API_KEY=sk-your-key
+# ANTHROPIC_API_KEY=your-anthropic-key
+# ...
+```
+> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
+
+#### 3. Build and Run with Compose
+
+The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**.
+
+*   **Build and Run Locally (AMD64):**
+    ```bash
+    # Builds the image locally using Dockerfile and runs it
+    docker compose --profile local-amd64 up --build -d
+    ```
+
+*   **Build and Run Locally (ARM64):**
+    ```bash
+    # Builds the image locally using Dockerfile and runs it
+    docker compose --profile local-arm64 up --build -d
+    ```
+
+*   **Run Pre-built Image from Docker Hub (AMD64):**
+    ```bash
+    # Pulls and runs the specified AMD64 image from Docker Hub
+    # (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1)
+    docker compose --profile hub-amd64 up -d
+    ```
+
+*   **Run Pre-built Image from Docker Hub (ARM64):**
+    ```bash
+    # Pulls and runs the specified ARM64 image from Docker Hub
+    docker compose --profile hub-arm64 up -d
+    ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping Compose Services
+
+```bash
+# Stop the service(s) associated with a profile (e.g., local-amd64)
+docker compose --profile local-amd64 down
+```
+
+### Option 2: Manual Local Build & Run
+
+Use this option if you prefer not to use Docker Compose for local builds.
+
+#### 1. Clone Repository & Setup Environment
+
+Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
+
+#### 2. Build the Image (Multi-Arch)
+
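+Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon. Note that combining `--load` with more than one `--platform` value requires Docker's containerd image store to be enabled; with the classic image store, pass only your host's platform instead (e.g., `--platform linux/amd64`).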
+
+```bash
+# Make sure you are in the 'crawl4ai' root directory
+docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
+```
+
+#### 3. Run the Container
+
+*   **Basic run (no LLM support):**
+    ```bash
+    # Replace --platform if your host is ARM64
+    docker run -d \
+      -p 11235:11235 \
+      --name crawl4ai-standalone \
+      --shm-size=1g \
+      --platform linux/amd64 \
+      crawl4ai-local:latest
+    ```
+
+*   **With LLM support:**
+    ```bash
+    # Make sure .llm.env is in the current directory (project root)
+    # Replace --platform if your host is ARM64
+    docker run -d \
+      -p 11235:11235 \
+      --name crawl4ai-standalone \
+      --env-file .llm.env \
+      --shm-size=1g \
+      --platform linux/amd64 \
+      crawl4ai-local:latest
+    ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping the Manual Container
+
+```bash
+docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
+```
+
+### Option 3: Using Pre-built Docker Hub Images
+
+Pull and run images directly from Docker Hub without building locally.
+
+#### 1. Pull the Image
+
+We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically.
+
+```bash
+# Pull a specific version (recommended for stability)
+docker pull unclecode/crawl4ai:0.5.1-d1
+
+# Or pull the latest stable version
+docker pull unclecode/crawl4ai:latest
+```
+
+#### 2. Setup Environment (API Keys)
+
+If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section.
+
+#### 3. Run the Container
+
+*   **Basic run:**
+    ```bash
+    docker run -d \
+      -p 11235:11235 \
+      --name crawl4ai-hub \
+      --shm-size=1g \
+      unclecode/crawl4ai:0.5.1-d1 # Or use :latest
+    ```
+
+*   **With LLM support:**
+    ```bash
+    # Make sure .llm.env is in the current directory you are running docker from
+    docker run -d \
+      -p 11235:11235 \
+      --name crawl4ai-hub \
+      --env-file .llm.env \
+      --shm-size=1g \
+      unclecode/crawl4ai:0.5.1-d1 # Or use :latest
+    ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping the Hub Container
+
+```bash
+docker stop crawl4ai-hub && docker rm crawl4ai-hub
+```
+
+#### Docker Hub Versioning Explained
+
+*   **Image Name:** `unclecode/crawl4ai`
+*   **Tag Format:** `LIBRARY_VERSION-dREVISION`
+    *   `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`).
+    *   `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`.
+*   **Example:** `unclecode/crawl4ai:0.5.1-d1`
+*   **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`.
+*   **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture.
+
+---
+
+*(Rest of the document remains largely the same, but with key updates below)*
+
+---
+
+## Dockerfile Parameters
+
+You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
+
+```bash
+# Example: Build with 'all' features using buildx
+docker buildx build \
+  --platform linux/amd64,linux/arm64 \
+  --build-arg INSTALL_TYPE=all \
+  -t yourname/crawl4ai-all:latest \
+  --load \
+  . # Build from root context
+```
+
+### Build Arguments Explained
+
+| Argument     | Description                              | Default   | Options                            |
+| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
+| INSTALL_TYPE | Feature set                              | `default` | `default`, `all`, `torch`, `transformer` |
+| ENABLE_GPU   | GPU support (CUDA for AMD64)           | `false`   | `true`, `false`                    |
+| APP_HOME     | Install path inside container (advanced) | `/app`    | any valid path                   |
+| USE_LOCAL    | Install library from local source        | `true`    | `true`, `false`                    |
+| GITHUB_REPO  | Git repo to clone if USE_LOCAL=false   | *(see Dockerfile)* | any git URL                  |
+| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false   | `main`    | any branch name                  |
+
+*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
+
+### Build Best Practices
+
+1.  **Choose the Right Install Type**
+    *   `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
+    *   `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
+2.  **Platform Considerations**
+    *   Use `buildx` for building multi-architecture images, especially for pushing to registries.
+    *   Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
+3.  **Performance Optimization**
+    *   The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
+
+---
+
+## Using the API
+
+Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
+
+### Python SDK
+
+Install the SDK: `pip install crawl4ai`
+
+```python
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
+
+async def main():
+    # Point to the correct server port
+    async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
+        # If JWT is enabled on the server, authenticate first:
+        # await client.authenticate("user@example.com") # See Server Configuration section
+
+        # Example Non-streaming crawl
+        print("--- Running Non-Streaming Crawl ---")
+        results = await client.crawl(
+            ["https://httpbin.org/html"],
+            browser_config=BrowserConfig(headless=True), # Use library classes for config aid
+            crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+        )
+        if results: # client.crawl returns None on failure
+          print(f"Non-streaming results success: {results.success}")
+          if results.success:
+              for result in results: # Iterate through the CrawlResultContainer
+                  print(f"URL: {result.url}, Success: {result.success}")
+        else:
+            print("Non-streaming crawl failed.")
+
+
+        # Example Streaming crawl
+        print("\n--- Running Streaming Crawl ---")
+        stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
+        try:
+            async for result in await client.crawl( # client.crawl returns an async generator for streaming
+                ["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
+                browser_config=BrowserConfig(headless=True),
+                crawler_config=stream_config
+            ):
+                print(f"Streamed result: URL: {result.url}, Success: {result.success}")
+        except Exception as e:
+            print(f"Streaming crawl failed: {e}")
+
+
+        # Example Get schema
+        print("\n--- Getting Schema ---")
+        schema = await client.get_schema()
+        print(f"Schema received: {bool(schema)}") # Print whether schema was received
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+*(SDK parameters like timeout, verify_ssl etc. remain the same)*
+
+### Second Approach: Direct API Calls
+
+Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
+
+*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
+
+#### More Examples *(Ensure Schema example uses type/value wrapper)*
+
+**Advanced Crawler Configuration**
+*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
+
+**Extraction Strategy**
+```json
+{
+    "crawler_config": {
+        "type": "CrawlerRunConfig",
+        "params": {
+            "extraction_strategy": {
+                "type": "JsonCssExtractionStrategy",
+                "params": {
+                    "schema": {
+                        "type": "dict",
+                        "value": {
+                           "baseSelector": "article.post",
+                           "fields": [
+                               {"name": "title", "selector": "h1", "type": "text"},
+                               {"name": "content", "selector": ".content", "type": "html"}
+                           ]
+                         }
+                    }
+                }
+            }
+        }
+    }
+}
+```
+
+**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
+*(Keep Deep Crawler Example)*
+
+### REST API Examples
+
+All examples below use the default server port `11235`.
+
+#### Simple Crawl
+
+```python
+import requests
+
+# Configuration objects converted to the required JSON structure
+browser_config_payload = {
+    "type": "BrowserConfig",
+    "params": {"headless": True}
+}
+crawler_config_payload = {
+    "type": "CrawlerRunConfig",
+    "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
+}
+
+crawl_payload = {
+    "urls": ["https://httpbin.org/html"],
+    "browser_config": browser_config_payload,
+    "crawler_config": crawler_config_payload
+}
+response = requests.post(
+    "http://localhost:11235/crawl", # Updated port
+    # headers={"Authorization": f"Bearer {token}"},  # If JWT is enabled
+    json=crawl_payload
+)
+print(f"Status Code: {response.status_code}")
+if response.ok:
+    print(response.json())
+else:
+    print(f"Error: {response.text}")
+
+```
+
+#### Streaming Results
+
+```python
+import json
+import httpx # Use httpx for async streaming example
+
+async def test_stream_crawl(token: str = None): # Made token optional
+    """Test the /crawl/stream endpoint with multiple URLs."""
+    url = "http://localhost:11235/crawl/stream" # Updated port
+    payload = {
+        "urls": [
+            "https://httpbin.org/html",
+            "https://httpbin.org/links/5/0",
+        ],
+        "browser_config": {
+            "type": "BrowserConfig",
+            "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
+        },
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {"stream": True, "cache_mode": "bypass"}
+        }
+    }
+
+    headers = {}
+    # if token:
+    #    headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
+
+    try:
+        async with httpx.AsyncClient() as client:
+            async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
+                print(f"Status: {response.status_code} (Expected: 200)")
+                response.raise_for_status() # Raise exception for bad status codes
+
+                # Read streaming response line-by-line (NDJSON)
+                async for line in response.aiter_lines():
+                    if line:
+                        try:
+                            data = json.loads(line)
+                            # Check for completion marker
+                            if data.get("status") == "completed":
+                                print("Stream completed.")
+                                break
+                            print(f"Streamed Result: {json.dumps(data, indent=2)}")
+                        except json.JSONDecodeError:
+                            print(f"Warning: Could not decode JSON line: {line}")
+
+    except httpx.HTTPStatusError as e:
+         print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
+    except Exception as e:
+        print(f"Error in streaming crawl test: {str(e)}")
+
+# To run this example:
+# import asyncio
+# asyncio.run(test_stream_crawl())
+```
+
+---
+
+## Metrics & Monitoring
+
+Keep an eye on your crawler with these endpoints:
+
+- `/health` - Quick health check
+- `/metrics` - Detailed Prometheus metrics
+- `/schema` - Full API schema
+
+Example health check:
+```bash
+curl http://localhost:11235/health
+```
+
+---
+
+*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
+
+---
+
+## Server Configuration
+
+The server's behavior can be customized through the `config.yml` file.
+
+### Understanding config.yml
+
+The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
+
+Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
+
+```yaml
+# Application Configuration
+app:
+  title: "Crawl4AI API"
+  version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
+  host: "0.0.0.0"
+  port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
+  reload: False # Default set to False - suitable for production
+  timeout_keep_alive: 300
+
+# Default LLM Configuration
+llm:
+  provider: "openai/gpt-4o-mini"
+  api_key_env: "OPENAI_API_KEY"
+  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
+
+# Redis Configuration (Used by internal Redis server managed by supervisord)
+redis:
+  host: "localhost"
+  port: 6379
+  db: 0
+  password: ""
+  # ... other redis options ...
+
+# Rate Limiting Configuration
+rate_limiting:
+  enabled: True
+  default_limit: "1000/minute"
+  trusted_proxies: []
+  storage_uri: "memory://"  # Use "redis://localhost:6379" if you need persistent/shared limits
+
+# Security Configuration
+security:
+  enabled: false # Master toggle for security features
+  jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
+  https_redirect: false # Force HTTPS (requires security.enabled=true)
+  trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
+  headers: # Security headers (applied if security.enabled=true)
+    x_content_type_options: "nosniff"
+    x_frame_options: "DENY"
+    content_security_policy: "default-src 'self'"
+    strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+  memory_threshold_percent: 95.0
+  rate_limiter:
+    base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
+  timeouts:
+    stream_init: 30.0  # Timeout for stream initialization
+    batch_process: 300.0 # Timeout for non-streaming /crawl processing
+
+# Logging Configuration
+logging:
+  level: "INFO"
+  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+  prometheus:
+    enabled: True
+    endpoint: "/metrics"
+  health_check:
+    endpoint: "/health"
+```
+
+*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
+
+*(Configuration Tips and Best Practices remain the same)*
+
+### Customizing Your Configuration
+
+You can override the default `config.yml`.
+
+#### Method 1: Modify Before Build
+
+1.  Edit the `deploy/docker/config.yml` file in your local repository clone.
+2.  Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
+
+#### Method 2: Runtime Mount (Recommended for Custom Deploys)
+
+1.  Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
+2.  Mount it when running the container:
+
+    *   **Using `docker run`:**
+        ```bash
+        # Assumes my-custom-config.yml is in the current directory
+        docker run -d -p 11235:11235 \
+          --name crawl4ai-custom-config \
+          --env-file .llm.env \
+          --shm-size=1g \
+          -v $(pwd)/my-custom-config.yml:/app/config.yml \
+          unclecode/crawl4ai:latest # Or your specific tag
+        ```
+
+    *   **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
+        ```yaml
+        services:
+          crawl4ai-hub-amd64: # Or your chosen service
+            image: unclecode/crawl4ai:latest
+            profiles: ["hub-amd64"]
+            <<: *base-config
+            volumes:
+              # Mount local custom config over the default one in the container
+              - ./my-custom-config.yml:/app/config.yml
+              # Keep the shared memory volume from base-config
+              - /dev/shm:/dev/shm
+        ```
+        *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
+
+> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
+
+### Configuration Recommendations
+
+1. **Security First** 🔒
+   - Always enable security in production
+   - Use specific trusted_hosts instead of wildcards
+   - Set up proper rate limiting to protect your server
+   - Consider your environment before enabling HTTPS redirect
+
+2. **Resource Management** 💻
+   - Adjust memory_threshold_percent based on available RAM
+   - Set timeouts according to your content size and network conditions
+   - Use Redis for rate limiting in multi-container setups
+
+3. **Monitoring** 📊
+   - Enable Prometheus if you need metrics
+   - Set DEBUG logging in development, INFO in production
+   - Regular health check monitoring is crucial
+
+4. **Performance Tuning** ⚡
+   - Start with conservative rate limiter delays
+   - Increase batch_process timeout for large content
+   - Adjust stream_init timeout based on initial response times
+
+## Getting Help
+
+We're here to help you succeed with Crawl4AI! Here's how to get support:
+
+- 📖 Check our [full documentation](https://docs.crawl4ai.com)
+- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
+- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
+- ⭐ Star us on GitHub to show support!
+
+## Summary
+
+In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
+- Building and running the Docker container
+- Configuring the environment
+- Making API requests with proper typing
+- Using the Python SDK
+- Monitoring your deployment
+
+Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+
+Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
+
+Happy crawling! 🕷️
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -388,21 +388,25 @@ async def handle_crawl_request(
            )
        )

-        async with AsyncWebCrawler(config=browser_config) as crawler:
-            results = []
-            func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
-            partial_func = partial(func, 
-                                   urls[0] if len(urls) == 1 else urls, 
-                                   config=crawler_config, 
-                                   dispatcher=dispatcher)
-            results = await partial_func()
-            return {
-                "success": True,
-                "results": [result.model_dump() for result in results]
-            }
+        crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
+        await crawler.start()
+        results = []
+        func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
+        partial_func = partial(func, 
+                                urls[0] if len(urls) == 1 else urls, 
+                                config=crawler_config, 
+                                dispatcher=dispatcher)
+        results = await partial_func()
+        await crawler.close()
+        return {
+            "success": True,
+            "results": [result.model_dump() for result in results]
+        }

    except Exception as e:
        logger.error(f"Crawl error: {str(e)}", exc_info=True)
+        if 'crawler' in locals():
+            await crawler.close()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -4,7 +4,7 @@ app:
  version: "1.0.0"
  host: "0.0.0.0"
  port: 8020
-  reload: True
+  reload: False
  timeout_keep_alive: 300

 # Default LLM Configuration
--- a/tests/docker/test_server_requests.py
+++ b/tests/docker/test_server_requests.py
@@ -0,0 +1,650 @@
+import pytest
+import pytest_asyncio
+import httpx
+import json
+import asyncio
+import os
+from typing import List, Dict, Any, AsyncGenerator
+
+# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
+# You don't strictly NEED these imports for the tests to run against the server,
+# but they help in understanding the structure you are mimicking in JSON.
+from crawl4ai import (
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter,
+    BM25ContentFilter,
+    BFSDeepCrawlStrategy,
+    FilterChain,
+    ContentTypeFilter,
+    DomainFilter,
+    CompositeScorer,
+    KeywordRelevanceScorer,
+    PathDepthScorer,
+    JsonCssExtractionStrategy,
+    LLMExtractionStrategy,
+    LLMConfig
+)
+
+# --- Test Configuration ---
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
+# Use a known simple HTML page for basic tests
+SIMPLE_HTML_URL = "https://httpbin.org/html"
+# Use a site suitable for scraping tests
+SCRAPE_TARGET_URL = "http://books.toscrape.com/"
+# Use a site with internal links for deep crawl tests
+DEEP_CRAWL_URL = "https://python.org"
+
+# --- Pytest Fixtures ---
+
+# Use the built-in event_loop fixture from pytest_asyncio
+# The custom implementation was causing issues with closing the loop
+
+@pytest_asyncio.fixture(scope="function")  # Changed to function scope to avoid event loop issues
+async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
+    """Yield an httpx.AsyncClient pointed at BASE_URL with a 120 s timeout."""
+    # Leaving the context manager closes the client after the test has used it.
+    async with httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) as http_client:
+        yield http_client
+
+# --- Helper Functions ---
+
+async def check_server_health(client: httpx.AsyncClient):
+    """Fail the running test unless GET /health succeeds; return True when it does."""
+    try:
+        health = await client.get("/health")
+        health.raise_for_status()
+        print(f"\nServer healthy: {health.json()}")
+    except (httpx.RequestError, httpx.HTTPStatusError) as exc:
+        pytest.fail(f"Server health check failed: {exc}. Is the server running at {BASE_URL}?", pytrace=False)
+    return True
+
+async def assert_crawl_result_structure(result: Dict[str, Any]):
+    """Assert that a single crawl result is a dict with "url", "success" and "html" keys."""
+    assert isinstance(result, dict)
+    # Keys are checked in this order; the first missing one fails the assertion.
+    for required_key in ("url", "success", "html"):
+        assert required_key in result
+    # Add more common checks if needed
+
+async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
+    """Collect the NDJSON objects streamed by the server, up to the completion marker."""
+    collected = []
+    saw_completion = False
+    async for raw_line in response.aiter_lines():
+        if not raw_line:
+            continue  # skip empty lines
+        try:
+            record = json.loads(raw_line)
+        except json.JSONDecodeError:
+            pytest.fail(f"Failed to decode JSON line: {raw_line}")
+        if record.get("status") == "completed":
+            saw_completion = True
+            break  # Stop processing after completion marker
+        collected.append(record)
+    assert saw_completion, "Streaming response did not end with a completion marker."
+    return collected
+
+
+# --- Test Class ---
+
+@pytest.mark.asyncio
+class TestCrawlEndpoints:
+
+    @pytest_asyncio.fixture(autouse=True)
+    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
+        """Autouse fixture: run check_server_health with the test's client before each test in this class."""
+        await check_server_health(async_client)
+
+    # 1. Simple Requests (Primitives)
+    async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
+        """POST one URL to /crawl with primitive-only config values and check the single result."""
+        # Each config section uses the {"type": ..., "params": ...} wire format.
+        browser_config = {
+            "type": "BrowserConfig",
+            "params": {"headless": True},
+        }
+        run_params = {
+            "stream": False,  # Explicitly false for /crawl
+            "screenshot": False,
+            "cache_mode": CacheMode.BYPASS.value,  # Use enum value
+        }
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": browser_config,
+            "crawler_config": {"type": "CrawlerRunConfig", "params": run_params},
+        }
+        try:
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as http_err:
+            # Show the server's explanation before letting the test fail.
+            print(f"Server error: {http_err}")
+            print(f"Response content: {http_err.response.text}")
+            raise
+
+        assert data["success"] is True
+        results = data["results"]
+        assert isinstance(results, list)
+        assert len(results) == 1
+        only_result = results[0]
+        await assert_crawl_result_structure(only_result)
+        assert only_result["success"] is True
+        assert only_result["url"] == SIMPLE_HTML_URL
+        assert "<h1>Herman Melville - Moby-Dick</h1>" in only_result["html"]
+        # No markdown generator is requested here, so nothing is asserted about the markdown field:
+        # it might be null, missing, or populated depending on the server's default behavior.
+
+    async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
+        """Stream one URL through /crawl/stream with primitive-only config values."""
+        browser_config = {
+            "type": "BrowserConfig",
+            "params": {"headless": True},
+        }
+        run_params = {
+            "stream": True,  # Must be true for /crawl/stream
+            "screenshot": False,
+            "cache_mode": CacheMode.BYPASS.value,
+        }
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": browser_config,
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": run_params,
+            },
+        }
+        async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+            response.raise_for_status()
+            streamed = await process_streaming_response(response)
+
+        assert len(streamed) == 1
+        (only_result,) = streamed
+        await assert_crawl_result_structure(only_result)
+        assert only_result["success"] is True
+        assert only_result["url"] == SIMPLE_HTML_URL
+        assert "<h1>Herman Melville - Moby-Dick</h1>" in only_result["html"]
+
+
+    # 2. Multi-URL and Dispatcher
+    async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
+        """Test /crawl with multiple URLs, implicitly testing dispatcher.
+
+        On an HTTP error status the response body is printed (parsed as JSON
+        when possible) and the HTTPStatusError is re-raised to fail the test.
+        """
+        urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+        payload = {
+            "urls": urls,
+            "browser_config": {
+                "type": "BrowserConfig",
+                "params": {"headless": True}
+            },
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
+            }
+        }
+        try:
+            print("Sending multi-URL crawl request to server...")
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as e:
+            # All error reporting lives here so a non-JSON error body cannot
+            # raise a decode error that hides the real HTTP failure.
+            print(f"Server error status: {e.response.status_code}")
+            print(f"Server error response: {e.response.text}")
+            try:
+                error_json = e.response.json()
+                print(f"Parsed error: {error_json}")
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                print("Could not parse error response as JSON")
+            raise
+
+        assert data["success"] is True
+        assert isinstance(data["results"], list)
+        assert len(data["results"]) == len(urls)
+        for result in data["results"]:
+            await assert_crawl_result_structure(result)
+            assert result["success"] is True
+            assert result["url"] in urls
+
+    async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
+        """Stream two URLs through /crawl/stream and check every URL yields a successful result."""
+        urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+        run_params = {"stream": True, "cache_mode": CacheMode.BYPASS.value}
+        payload = {
+            "urls": urls,
+            "browser_config": {
+                "type": "BrowserConfig",
+                "params": {"headless": True},
+            },
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": run_params,
+            },
+        }
+        async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+            response.raise_for_status()
+            streamed = await process_streaming_response(response)
+
+        assert len(streamed) == len(urls)
+        for item in streamed:
+            await assert_crawl_result_structure(item)
+            assert item["success"] is True
+            assert item["url"] in urls
+        # Ensure all URLs were processed (the set comparison ignores arrival order)
+        assert {item["url"] for item in streamed} == set(urls)
+
+
+    # 3. Class Values and Nested Classes (Markdown Generator)
+    async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
+        """Test /crawl with MarkdownGenerator using PruningContentFilter.
+
+        On an HTTP error status the response body is printed (parsed as JSON
+        when possible) and the HTTPStatusError is re-raised to fail the test.
+        """
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {
+                    "cache_mode": CacheMode.ENABLED.value, # Test different cache mode
+                    "markdown_generator": {
+                        "type": "DefaultMarkdownGenerator",
+                        "params": {
+                            "content_filter": {
+                                "type": "PruningContentFilter",
+                                "params": {
+                                    "threshold": 0.5, # Example param
+                                    "threshold_type": "relative"
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        try:
+            print("Sending markdown pruning-filter crawl request to server...")
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as e:
+            # All error reporting lives here so a non-JSON error body cannot
+            # raise a decode error that hides the real HTTP failure.
+            print(f"Server error status: {e.response.status_code}")
+            print(f"Server error response: {e.response.text}")
+            try:
+                error_json = e.response.json()
+                print(f"Parsed error: {error_json}")
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                print("Could not parse error response as JSON")
+            raise
+
+        assert data["success"] is True
+        assert len(data["results"]) == 1
+        result = data["results"][0]
+        await assert_crawl_result_structure(result)
+        assert result["success"] is True
+        assert "markdown" in result
+        assert isinstance(result["markdown"], dict)
+        assert "raw_markdown" in result["markdown"]
+        assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
+        assert "Moby-Dick" in result["markdown"]["raw_markdown"]
+        # Fit markdown content might be different/shorter due to pruning
+        assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
+
+    async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
+        """Test /crawl with MarkdownGenerator using BM25ContentFilter.
+
+        On an HTTP error status the response body is printed (parsed as JSON
+        when possible) and the HTTPStatusError is re-raised to fail the test.
+        """
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {
+                    "markdown_generator": {
+                        "type": "DefaultMarkdownGenerator",
+                        "params": {
+                            "content_filter": {
+                                "type": "BM25ContentFilter",
+                                "params": {
+                                    "user_query": "Herman Melville", # Query for BM25
+                                    "bm25_threshold": 0.1, # Lower threshold to increase matches
+                                    "language": "english"  # Valid parameters
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        try:
+            print(f"Payload for BM25 test: {json.dumps(payload)}")
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as e:
+            # All error reporting lives here so a non-JSON error body cannot
+            # raise a decode error that hides the real HTTP failure.
+            print(f"Server error status: {e.response.status_code}")
+            print(f"Server error response: {e.response.text}")
+            try:
+                error_json = e.response.json()
+                print(f"Parsed error: {error_json}")
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                print("Could not parse error response as JSON")
+            raise
+
+        assert data["success"] is True
+        assert len(data["results"]) == 1
+        result = data["results"][0]
+        await assert_crawl_result_structure(result)
+        assert result["success"] is True
+        assert "markdown" in result
+        assert isinstance(result["markdown"], dict)
+        assert "raw_markdown" in result["markdown"]
+        assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
+
+        # Print values for debug
+        print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
+        print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
+
+        # Either fit_markdown has content (possibly including our query terms)
+        # or it might be empty if no good BM25 matches were found
+        # Don't assert specific content since it can be environment-dependent
+
+
+    # 4. Deep Crawling
+    async def test_deep_crawl(self, async_client: httpx.AsyncClient):
+        """Test /crawl with a deep crawl strategy.
+
+        On an HTTP error status the response body is printed (parsed as JSON
+        when possible) and the HTTPStatusError is re-raised to fail the test.
+        """
+        payload = {
+            "urls": [DEEP_CRAWL_URL], # Start URL
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {
+                    "stream": False,
+                    "cache_mode": CacheMode.BYPASS.value,
+                    "deep_crawl_strategy": {
+                        "type": "BFSDeepCrawlStrategy",
+                        "params": {
+                            "max_depth": 1, # Limit depth for testing speed
+                            "max_pages": 5, # Limit pages to crawl
+                            "filter_chain": {
+                                "type": "FilterChain",
+                                "params": {
+                                    "filters": [
+                                        {
+                                            "type": "ContentTypeFilter",
+                                            "params": {"allowed_types": ["text/html"]}
+                                        },
+                                        {
+                                            "type": "DomainFilter",
+                                            "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
+                                        }
+                                    ]
+                                }
+                            },
+                            "url_scorer": {
+                                "type": "CompositeScorer",
+                                "params": {
+                                    "scorers": [
+                                        {
+                                            "type": "KeywordRelevanceScorer",
+                                            "params": {"keywords": ["documentation", "tutorial"]}
+                                        },
+                                        {
+                                            "type": "PathDepthScorer",
+                                            "params": {"weight": 0.5, "optimal_depth": 2}
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        try:
+            print("Sending deep crawl request to server...")
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as e:
+            # All error reporting lives here so a non-JSON error body cannot
+            # raise a decode error that hides the real HTTP failure.
+            print(f"Server error status: {e.response.status_code}")
+            print(f"Server error response: {e.response.text}")
+            try:
+                error_json = e.response.json()
+                print(f"Parsed error: {error_json}")
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                print("Could not parse error response as JSON")
+            raise
+
+        assert data["success"] is True
+        assert isinstance(data["results"], list)
+        # Expect more than 1 result due to deep crawl (start URL + crawled links)
+        assert len(data["results"]) > 1
+        assert len(data["results"]) <= 6 # Start URL + max_pages=5
+
+        start_url_found = False
+        crawled_urls_found = False
+        for result in data["results"]:
+            await assert_crawl_result_structure(result)
+            assert result["success"] is True
+
+            # Print URL for debugging
+            print(f"Crawled URL: {result['url']}")
+
+            # Allow URLs that contain python.org (including subdomains like docs.python.org)
+            assert "python.org" in result["url"]
+            if result["url"] == DEEP_CRAWL_URL:
+                start_url_found = True
+            else:
+                crawled_urls_found = True
+
+        assert start_url_found
+        assert crawled_urls_found
+
+
+    # 5. Extraction without LLM (JSON/CSS)
+    async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
+        """Test /crawl with JsonCssExtractionStrategy.
+
+        On an HTTP error status the response body is printed (parsed as JSON
+        when possible) and the HTTPStatusError is re-raised to fail the test.
+        """
+        payload = {
+            "urls": [SCRAPE_TARGET_URL],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {
+                    "cache_mode": CacheMode.BYPASS.value,
+                    "extraction_strategy": {
+                        "type": "JsonCssExtractionStrategy",
+                        "params": {
+                            "schema": { 
+                                "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
+                                "value": {
+                                    "name": "BookList",
+                                    "baseSelector": "ol.row li.col-xs-6", # Select each book item
+                                    "fields": [
+                                        {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+                                        {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
+                                        {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
+                                    ]
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        try:
+            print("Sending JSON/CSS extraction crawl request to server...")
+            response = await async_client.post("/crawl", json=payload)
+            print(f"Response status: {response.status_code}")
+            response.raise_for_status()
+            data = response.json()
+        except httpx.HTTPStatusError as e:
+            # All error reporting lives here so a non-JSON error body cannot
+            # raise a decode error that hides the real HTTP failure.
+            print(f"Server error status: {e.response.status_code}")
+            print(f"Server error response: {e.response.text}")
+            try:
+                error_json = e.response.json()
+                print(f"Parsed error: {error_json}")
+            except ValueError:  # json.JSONDecodeError is a ValueError subclass
+                print("Could not parse error response as JSON")
+            raise
+
+        assert data["success"] is True
+        assert len(data["results"]) == 1
+        result = data["results"][0]
+        await assert_crawl_result_structure(result)
+        assert result["success"] is True
+        assert "extracted_content" in result
+        assert result["extracted_content"] is not None
+
+        # Extracted content should be a JSON string representing a list of dicts
+        try:
+            extracted_data = json.loads(result["extracted_content"])
+            assert isinstance(extracted_data, list)
+            assert len(extracted_data) > 0 # Should find some books
+            # Check structure of the first extracted item
+            first_item = extracted_data[0]
+            assert "title" in first_item
+            assert "price" in first_item
+            assert "rating" in first_item
+            assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
+        except (json.JSONDecodeError, AssertionError) as e:
+            pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
+
+
+    # 6. Extraction with LLM
+    async def test_llm_extraction(self, async_client: httpx.AsyncClient):
+        """
+        Test /crawl with LLMExtractionStrategy.
+        NOTE: Needs a usable LLM API key. The payload sets llm_config explicitly
+              (provider "openai/gpt-4o", api_token taken from this process's
+              OPENAI_API_KEY variable). If that variable is unset the token is null;
+              presumably the server then uses its own config.yml / .llm.env - TODO confirm.
+        """
+        # Per-request LLM settings sent to the server.
+        llm_config = {
+            "type": "LLMConfig",
+            "params": {
+                "provider": "openai/gpt-4o",
+                "api_token": os.getenv("OPENAI_API_KEY"),
+            },
+        }
+        # JSON schema for the structured output; plain dicts must be wrapped as type/value.
+        output_schema = {
+            "type": "dict",
+            "value": {
+                "title": "Book Info",
+                "type": "object",
+                "properties": {
+                    "title": {"type": "string", "description": "The main title of the work"},
+                    "author": {"type": "string", "description": "The author of the work"},
+                },
+                "required": ["title", "author"],
+            },
+        }
+        strategy_params = {
+            "instruction": "Extract the main title and the author mentioned in the text into JSON.",
+            "llm_config": llm_config,
+            "schema": output_schema,
+        }
+        payload = {
+            "urls": [SIMPLE_HTML_URL],
+            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+            "crawler_config": {
+                "type": "CrawlerRunConfig",
+                "params": {
+                    "cache_mode": CacheMode.BYPASS.value,
+                    "extraction_strategy": {
+                        "type": "LLMExtractionStrategy",
+                        "params": strategy_params,
+                    },
+                },
+            },
+        }
+
+        try:
+            response = await async_client.post("/crawl", json=payload)
+            response.raise_for_status()  # Will raise if server returns 500 (e.g., bad API key)
+            data = response.json()
+        except httpx.HTTPStatusError as status_err:
+            # Catch potential server errors (like 500 due to missing/invalid API keys)
+            pytest.fail(f"LLM extraction request failed: {status_err}. Response: {status_err.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
+        except httpx.RequestError as request_err:
+            pytest.fail(f"LLM extraction request failed: {request_err}.")
+
+        assert data["success"] is True
+        assert len(data["results"]) == 1
+        result = data["results"][0]
+        await assert_crawl_result_structure(result)
+        assert result["success"] is True
+        assert "extracted_content" in result
+        assert result["extracted_content"] is not None
+
+        # Extracted content should be JSON (because we provided a schema)
+        try:
+            extracted_data = json.loads(result["extracted_content"])
+            print(f"\nLLM Extracted Data: {extracted_data}")  # Print for verification
+
+            # Handle both dict and list formats (server returns a list)
+            if isinstance(extracted_data, list):
+                assert len(extracted_data) > 0
+                record = extracted_data[0]  # Take first item
+            else:
+                record = extracted_data
+            assert isinstance(record, dict)
+            assert "title" in record
+            assert "author" in record
+            assert "Moby-Dick" in record.get("title", "")
+            assert "Herman Melville" in record.get("author", "")
+        except (json.JSONDecodeError, AssertionError) as parse_err:
+            pytest.fail(f"LLM extracted content parsing or validation failed: {parse_err}\nContent: {result['extracted_content']}")
+        except Exception as other_err:  # Catch any other unexpected error
+            pytest.fail(f"An unexpected error occurred during LLM result processing: {other_err}\nContent: {result['extracted_content']}")
+            
+if __name__ == "__main__":
+    # Run this file's tests through pytest when it is executed directly.
+    #   -v        verbose output
+    #   -s        show print statements immediately (useful for debugging)
+    #   __file__  tells pytest to run the tests in the current file
+    cli_args = ["-v", "-s", __file__]
+
+    # More pytest arguments can be appended here, for example to run only
+    # the LLM test function:
+    # cli_args.append("-k test_llm_extraction")
+
+    print(f"Running pytest with args: {cli_args}")
+
+    # Execute pytest
+    status = pytest.main(cli_args)
+
+    print(f"Pytest finished with exit code: {status}")