## Docker Deployment
|
|
|
|
Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration.
|
|
|
|
### Quick Start with Pre-built Images
|
|
|
|
```bash
|
|
# Pull latest image
|
|
docker pull unclecode/crawl4ai:latest
|
|
|
|
# Setup LLM API keys
|
|
cat > .llm.env << EOL
|
|
OPENAI_API_KEY=sk-your-key
|
|
ANTHROPIC_API_KEY=your-anthropic-key
|
|
GROQ_API_KEY=your-groq-key
|
|
GEMINI_API_TOKEN=your-gemini-token
|
|
EOL
|
|
|
|
# Run with LLM support
|
|
docker run -d \
|
|
-p 11235:11235 \
|
|
--name crawl4ai \
|
|
--env-file .llm.env \
|
|
--shm-size=1g \
|
|
unclecode/crawl4ai:latest
|
|
|
|
# Basic run (no LLM)
|
|
docker run -d \
|
|
-p 11235:11235 \
|
|
--name crawl4ai \
|
|
--shm-size=1g \
|
|
unclecode/crawl4ai:latest
|
|
|
|
# Check health
|
|
curl http://localhost:11235/health
|
|
```
|
|
|
|
### Docker Compose Deployment
|
|
|
|
```bash
|
|
# Clone and setup
|
|
git clone https://github.com/unclecode/crawl4ai.git
|
|
cd crawl4ai
|
|
cp deploy/docker/.llm.env.example .llm.env
|
|
# Edit .llm.env with your API keys
|
|
|
|
# Run pre-built image
|
|
IMAGE=unclecode/crawl4ai:latest docker compose up -d
|
|
|
|
# Build locally
|
|
docker compose up --build -d
|
|
|
|
# Build with all features
|
|
INSTALL_TYPE=all docker compose up --build -d
|
|
|
|
# Build with GPU support
|
|
ENABLE_GPU=true docker compose up --build -d
|
|
|
|
# Stop service
|
|
docker compose down
|
|
```
|
|
|
|
### Manual Build with Multi-Architecture
|
|
|
|
```bash
|
|
# Clone repository
|
|
git clone https://github.com/unclecode/crawl4ai.git
|
|
cd crawl4ai
|
|
|
|
# Build for current architecture
|
|
docker buildx build -t crawl4ai-local:latest --load .
|
|
|
|
# Build for multiple architectures
|
|
docker buildx build --platform linux/amd64,linux/arm64 \
|
|
-t crawl4ai-local:latest --load .
|
|
|
|
# Build with specific features
|
|
docker buildx build \
|
|
--build-arg INSTALL_TYPE=all \
|
|
--build-arg ENABLE_GPU=false \
|
|
-t crawl4ai-local:latest --load .
|
|
|
|
# Run custom build
|
|
docker run -d \
|
|
-p 11235:11235 \
|
|
--name crawl4ai-custom \
|
|
--env-file .llm.env \
|
|
--shm-size=1g \
|
|
crawl4ai-local:latest
|
|
```
|
|
|
|
### Build Arguments
|
|
|
|
```bash
|
|
# Available build options
|
|
docker buildx build \
|
|
--build-arg INSTALL_TYPE=all \ # default|all|torch|transformer
|
|
--build-arg ENABLE_GPU=true \ # true|false
|
|
--build-arg APP_HOME=/app \ # Install path
|
|
--build-arg USE_LOCAL=true \ # Use local source
|
|
--build-arg GITHUB_REPO=url \ # Git repo if USE_LOCAL=false
|
|
--build-arg GITHUB_BRANCH=main \ # Git branch
|
|
-t crawl4ai-custom:latest --load .
|
|
```
|
|
|
|
### Core API Endpoints
|
|
|
|
```python
|
|
# Main crawling endpoints
|
|
import requests
|
|
import json
|
|
|
|
# Basic crawl
|
|
payload = {
|
|
"urls": ["https://example.com"],
|
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
|
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
|
|
}
|
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
|
|
# Streaming crawl
|
|
payload["crawler_config"]["params"]["stream"] = True
|
|
response = requests.post("http://localhost:11235/crawl/stream", json=payload)
|
|
|
|
# Health check
|
|
response = requests.get("http://localhost:11235/health")
|
|
|
|
# API schema
|
|
response = requests.get("http://localhost:11235/schema")
|
|
|
|
# Metrics (Prometheus format)
|
|
response = requests.get("http://localhost:11235/metrics")
|
|
```
|
|
|
|
### Specialized Endpoints
|
|
|
|
```python
|
|
# HTML extraction (preprocessed for schema)
|
|
response = requests.post("http://localhost:11235/html",
|
|
json={"url": "https://example.com"})
|
|
|
|
# Screenshot capture
|
|
response = requests.post("http://localhost:11235/screenshot", json={
|
|
"url": "https://example.com",
|
|
"screenshot_wait_for": 2,
|
|
"output_path": "/path/to/save/screenshot.png"
|
|
})
|
|
|
|
# PDF generation
|
|
response = requests.post("http://localhost:11235/pdf", json={
|
|
"url": "https://example.com",
|
|
"output_path": "/path/to/save/document.pdf"
|
|
})
|
|
|
|
# JavaScript execution
|
|
response = requests.post("http://localhost:11235/execute_js", json={
|
|
"url": "https://example.com",
|
|
"scripts": [
|
|
"return document.title",
|
|
"return Array.from(document.querySelectorAll('a')).map(a => a.href)"
|
|
]
|
|
})
|
|
|
|
# Markdown generation
|
|
response = requests.post("http://localhost:11235/md", json={
|
|
"url": "https://example.com",
|
|
"f": "fit", # raw|fit|bm25|llm
|
|
"q": "extract main content", # query for filtering
|
|
"c": "0" # cache: 0=bypass, 1=use
|
|
})
|
|
|
|
# LLM Q&A
|
|
response = requests.get("http://localhost:11235/llm/https://example.com", params={"q": "What is this page about?"})
|
|
|
|
# Library context (for AI assistants)
|
|
response = requests.get("http://localhost:11235/ask", params={
|
|
"context_type": "all", # code|doc|all
|
|
"query": "how to use extraction strategies",
|
|
"score_ratio": 0.5,
|
|
"max_results": 20
|
|
})
|
|
```
|
|
|
|
### Python SDK Usage
|
|
|
|
```python
|
|
import asyncio
|
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
|
|
|
async def main():
|
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
|
# Non-streaming crawl
|
|
results = await client.crawl(
|
|
["https://example.com"],
|
|
browser_config=BrowserConfig(headless=True),
|
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
|
)
|
|
|
|
for result in results:
|
|
print(f"URL: {result.url}, Success: {result.success}")
|
|
print(f"Content length: {len(result.markdown)}")
|
|
|
|
# Streaming crawl
|
|
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
|
async for result in await client.crawl(
|
|
["https://example.com", "https://python.org"],
|
|
browser_config=BrowserConfig(headless=True),
|
|
crawler_config=stream_config
|
|
):
|
|
print(f"Streamed: {result.url} - {result.success}")
|
|
|
|
# Get API schema
|
|
schema = await client.get_schema()
|
|
print(f"Schema available: {bool(schema)}")
|
|
|
|
asyncio.run(main())
|
|
```
|
|
|
|
### Advanced API Configuration
|
|
|
|
```python
|
|
# Complex extraction with LLM
|
|
payload = {
|
|
"urls": ["https://example.com"],
|
|
"browser_config": {
|
|
"type": "BrowserConfig",
|
|
"params": {
|
|
"headless": True,
|
|
"viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}
|
|
}
|
|
},
|
|
"crawler_config": {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"extraction_strategy": {
|
|
"type": "LLMExtractionStrategy",
|
|
"params": {
|
|
"llm_config": {
|
|
"type": "LLMConfig",
|
|
"params": {
|
|
"provider": "openai/gpt-4o-mini",
|
|
"api_token": "env:OPENAI_API_KEY"
|
|
}
|
|
},
|
|
"schema": {
|
|
"type": "dict",
|
|
"value": {
|
|
"type": "object",
|
|
"properties": {
|
|
"title": {"type": "string"},
|
|
"content": {"type": "string"}
|
|
}
|
|
}
|
|
},
|
|
"instruction": "Extract title and main content"
|
|
}
|
|
},
|
|
"markdown_generator": {
|
|
"type": "DefaultMarkdownGenerator",
|
|
"params": {
|
|
"content_filter": {
|
|
"type": "PruningContentFilter",
|
|
"params": {"threshold": 0.6}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
```
|
|
|
|
### CSS Extraction Strategy
|
|
|
|
```python
|
|
# CSS-based structured extraction
|
|
schema = {
|
|
"name": "ProductList",
|
|
"baseSelector": ".product",
|
|
"fields": [
|
|
{"name": "title", "selector": "h2", "type": "text"},
|
|
{"name": "price", "selector": ".price", "type": "text"},
|
|
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
|
]
|
|
}
|
|
|
|
payload = {
|
|
"urls": ["https://example-shop.com"],
|
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
|
"crawler_config": {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"extraction_strategy": {
|
|
"type": "JsonCssExtractionStrategy",
|
|
"params": {
|
|
"schema": {"type": "dict", "value": schema}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
data = response.json()
|
|
extracted = json.loads(data["results"][0]["extracted_content"])
|
|
```
|
|
|
|
### MCP (Model Context Protocol) Integration
|
|
|
|
```bash
|
|
# Add Crawl4AI as MCP provider to Claude Code
|
|
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
|
|
|
|
# List MCP providers
|
|
claude mcp list
|
|
|
|
# Test MCP connection
|
|
python tests/mcp/test_mcp_socket.py
|
|
|
|
# Available MCP endpoints
|
|
# SSE: http://localhost:11235/mcp/sse
|
|
# WebSocket: ws://localhost:11235/mcp/ws
|
|
# Schema: http://localhost:11235/mcp/schema
|
|
```
|
|
|
|
Available MCP tools:
|
|
- `md` - Generate markdown from web content
|
|
- `html` - Extract preprocessed HTML
|
|
- `screenshot` - Capture webpage screenshots
|
|
- `pdf` - Generate PDF documents
|
|
- `execute_js` - Run JavaScript on web pages
|
|
- `crawl` - Perform multi-URL crawling
|
|
- `ask` - Query Crawl4AI library context
|
|
|
|
### Configuration Management
|
|
|
|
```yaml
|
|
# config.yml structure
|
|
app:
|
|
title: "Crawl4AI API"
|
|
version: "1.0.0"
|
|
host: "0.0.0.0"
|
|
port: 11235
|
|
timeout_keep_alive: 300
|
|
|
|
llm:
|
|
provider: "openai/gpt-4o-mini"
|
|
api_key_env: "OPENAI_API_KEY"
|
|
|
|
security:
|
|
enabled: false
|
|
jwt_enabled: false
|
|
trusted_hosts: ["*"]
|
|
|
|
crawler:
|
|
memory_threshold_percent: 95.0
|
|
rate_limiter:
|
|
base_delay: [1.0, 2.0]
|
|
timeouts:
|
|
stream_init: 30.0
|
|
batch_process: 300.0
|
|
pool:
|
|
max_pages: 40
|
|
idle_ttl_sec: 1800
|
|
|
|
rate_limiting:
|
|
enabled: true
|
|
default_limit: "1000/minute"
|
|
storage_uri: "memory://"
|
|
|
|
logging:
|
|
level: "INFO"
|
|
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
```
|
|
|
|
### Custom Configuration Deployment
|
|
|
|
```bash
|
|
# Method 1: Mount custom config
|
|
docker run -d -p 11235:11235 \
|
|
--name crawl4ai-custom \
|
|
--env-file .llm.env \
|
|
--shm-size=1g \
|
|
-v $(pwd)/my-config.yml:/app/config.yml \
|
|
unclecode/crawl4ai:latest
|
|
|
|
# Method 2: Build with custom config
|
|
# Edit deploy/docker/config.yml then build
|
|
docker buildx build -t crawl4ai-custom:latest --load .
|
|
```
|
|
|
|
### Monitoring and Health Checks
|
|
|
|
```bash
|
|
# Health endpoint
|
|
curl http://localhost:11235/health
|
|
|
|
# Prometheus metrics
|
|
curl http://localhost:11235/metrics
|
|
|
|
# Configuration validation
|
|
curl -X POST http://localhost:11235/config/dump \
|
|
-H "Content-Type: application/json" \
|
|
-d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}'
|
|
```
|
|
|
|
### Playground Interface
|
|
|
|
Access the interactive playground at `http://localhost:11235/playground` for:
|
|
- Testing configurations with visual interface
|
|
- Generating JSON payloads for REST API
|
|
- Converting Python config to JSON format
|
|
- Testing crawl operations directly in browser
|
|
|
|
### Async Job Processing
|
|
|
|
```python
|
|
# Submit job for async processing
|
|
import time
|
|
|
|
# Submit crawl job
|
|
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
|
task_id = response.json()["task_id"]
|
|
|
|
# Poll for completion
|
|
while True:
|
|
result = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
|
status = result.json()
|
|
|
|
if status["status"] in ["COMPLETED", "FAILED"]:
|
|
break
|
|
time.sleep(1.5)
|
|
|
|
print("Final result:", status)
|
|
```
|
|
|
|
### Production Deployment
|
|
|
|
```bash
|
|
# Production-ready deployment
|
|
docker run -d \
|
|
--name crawl4ai-prod \
|
|
--restart unless-stopped \
|
|
-p 11235:11235 \
|
|
--env-file .llm.env \
|
|
--shm-size=2g \
|
|
--memory=8g \
|
|
--cpus=4 \
|
|
-v /path/to/custom-config.yml:/app/config.yml \
|
|
unclecode/crawl4ai:latest
|
|
|
|
```

With Docker Compose for production (`docker-compose.yml`):

```yaml
version: '3.8'
|
|
services:
|
|
crawl4ai:
|
|
image: unclecode/crawl4ai:latest
|
|
ports:
|
|
- "11235:11235"
|
|
environment:
|
|
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
|
volumes:
|
|
- ./config.yml:/app/config.yml
|
|
shm_size: 2g
|
|
deploy:
|
|
resources:
|
|
limits:
|
|
memory: 8G
|
|
cpus: '4'
|
|
restart: unless-stopped
|
|
```
|
|
|
|
### Configuration Validation and JSON Structure
|
|
|
|
```python
|
|
# Method 1: Create config objects and dump to see expected JSON structure
|
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
|
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
|
|
import json
|
|
|
|
# Create browser config and see JSON structure
|
|
browser_config = BrowserConfig(
|
|
headless=True,
|
|
viewport_width=1280,
|
|
viewport_height=720,
|
|
proxy="http://user:pass@proxy:8080"
|
|
)
|
|
|
|
# Get JSON structure
|
|
browser_json = browser_config.dump()
|
|
print("BrowserConfig JSON structure:")
|
|
print(json.dumps(browser_json, indent=2))
|
|
|
|
# Create crawler config with extraction strategy
|
|
schema = {
|
|
"name": "Articles",
|
|
"baseSelector": ".article",
|
|
"fields": [
|
|
{"name": "title", "selector": "h2", "type": "text"},
|
|
{"name": "content", "selector": ".content", "type": "html"}
|
|
]
|
|
}
|
|
|
|
crawler_config = CrawlerRunConfig(
|
|
cache_mode=CacheMode.BYPASS,
|
|
screenshot=True,
|
|
extraction_strategy=JsonCssExtractionStrategy(schema),
|
|
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
|
|
wait_for="css:.loaded"
|
|
)
|
|
|
|
crawler_json = crawler_config.dump()
|
|
print("\nCrawlerRunConfig JSON structure:")
|
|
print(json.dumps(crawler_json, indent=2))
|
|
```
|
|
|
|
### Reverse Validation - JSON to Objects
|
|
|
|
```python
|
|
# Method 2: Load JSON back to config objects for validation
|
|
from crawl4ai.async_configs import from_serializable_dict
|
|
|
|
# Test JSON structure by converting back to objects
|
|
test_browser_json = {
|
|
"type": "BrowserConfig",
|
|
"params": {
|
|
"headless": True,
|
|
"viewport_width": 1280,
|
|
"proxy": "http://user:pass@proxy:8080"
|
|
}
|
|
}
|
|
|
|
try:
|
|
# Convert JSON back to object
|
|
restored_browser = from_serializable_dict(test_browser_json)
|
|
print(f"✅ Valid BrowserConfig: {type(restored_browser)}")
|
|
print(f"Headless: {restored_browser.headless}")
|
|
print(f"Proxy: {restored_browser.proxy}")
|
|
except Exception as e:
|
|
print(f"❌ Invalid BrowserConfig JSON: {e}")
|
|
|
|
# Test complex crawler config JSON
|
|
test_crawler_json = {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"cache_mode": "bypass",
|
|
"screenshot": True,
|
|
"extraction_strategy": {
|
|
"type": "JsonCssExtractionStrategy",
|
|
"params": {
|
|
"schema": {
|
|
"type": "dict",
|
|
"value": {
|
|
"name": "Products",
|
|
"baseSelector": ".product",
|
|
"fields": [
|
|
{"name": "title", "selector": "h3", "type": "text"}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
try:
|
|
restored_crawler = from_serializable_dict(test_crawler_json)
|
|
print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}")
|
|
print(f"Cache mode: {restored_crawler.cache_mode}")
|
|
print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}")
|
|
except Exception as e:
|
|
print(f"❌ Invalid CrawlerRunConfig JSON: {e}")
|
|
```
|
|
|
|
### Using Server's /config/dump Endpoint for Validation
|
|
|
|
```python
|
|
import requests
import json
|
|
|
|
# Method 3: Use server endpoint to validate configuration syntax
|
|
def validate_config_with_server(config_code: str) -> dict:
|
|
"""Validate configuration using server's /config/dump endpoint"""
|
|
response = requests.post(
|
|
"http://localhost:11235/config/dump",
|
|
json={"code": config_code}
|
|
)
|
|
|
|
if response.status_code == 200:
|
|
print("✅ Valid configuration syntax")
|
|
return response.json()
|
|
else:
|
|
print(f"❌ Invalid configuration: {response.status_code}")
|
|
print(response.json())
|
|
return None
|
|
|
|
# Test valid configuration
|
|
valid_config = """
|
|
CrawlerRunConfig(
|
|
cache_mode=CacheMode.BYPASS,
|
|
screenshot=True,
|
|
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
|
|
wait_for="css:.content-loaded"
|
|
)
|
|
"""
|
|
|
|
result = validate_config_with_server(valid_config)
|
|
if result:
|
|
print("Generated JSON structure:")
|
|
print(json.dumps(result, indent=2))
|
|
|
|
# Test invalid configuration (should fail)
|
|
invalid_config = """
|
|
CrawlerRunConfig(
|
|
cache_mode="invalid_mode",
|
|
screenshot=True,
|
|
js_code=some_function() # This will fail
|
|
)
|
|
"""
|
|
|
|
validate_config_with_server(invalid_config)
|
|
```
|
|
|
|
### Configuration Builder Helper
|
|
|
|
```python
|
|
def build_and_validate_request(urls, browser_params=None, crawler_params=None):
|
|
"""Helper to build and validate complete request payload"""
|
|
|
|
# Create configurations
|
|
browser_config = BrowserConfig(**(browser_params or {}))
|
|
crawler_config = CrawlerRunConfig(**(crawler_params or {}))
|
|
|
|
# Build complete request payload
|
|
payload = {
|
|
"urls": urls if isinstance(urls, list) else [urls],
|
|
"browser_config": browser_config.dump(),
|
|
"crawler_config": crawler_config.dump()
|
|
}
|
|
|
|
print("✅ Complete request payload:")
|
|
print(json.dumps(payload, indent=2))
|
|
|
|
# Validate by attempting to reconstruct
|
|
try:
|
|
test_browser = from_serializable_dict(payload["browser_config"])
|
|
test_crawler = from_serializable_dict(payload["crawler_config"])
|
|
print("✅ Payload validation successful")
|
|
return payload
|
|
except Exception as e:
|
|
print(f"❌ Payload validation failed: {e}")
|
|
return None
|
|
|
|
# Example usage
|
|
payload = build_and_validate_request(
|
|
urls=["https://example.com"],
|
|
browser_params={"headless": True, "viewport_width": 1280},
|
|
crawler_params={
|
|
"cache_mode": CacheMode.BYPASS,
|
|
"screenshot": True,
|
|
"word_count_threshold": 10
|
|
}
|
|
)
|
|
|
|
if payload:
|
|
# Send to server
|
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
print(f"Server response: {response.status_code}")
|
|
```
|
|
|
|
### Common JSON Structure Patterns
|
|
|
|
```python
|
|
# Pattern 1: Simple primitive values
|
|
simple_config = {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"cache_mode": "bypass", # String enum value
|
|
"screenshot": True, # Boolean
|
|
"page_timeout": 60000 # Integer
|
|
}
|
|
}
|
|
|
|
# Pattern 2: Nested objects
|
|
nested_config = {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"extraction_strategy": {
|
|
"type": "LLMExtractionStrategy",
|
|
"params": {
|
|
"llm_config": {
|
|
"type": "LLMConfig",
|
|
"params": {
|
|
"provider": "openai/gpt-4o-mini",
|
|
"api_token": "env:OPENAI_API_KEY"
|
|
}
|
|
},
|
|
"instruction": "Extract main content"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# Pattern 3: Dictionary values (must use type: dict wrapper)
|
|
dict_config = {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"extraction_strategy": {
|
|
"type": "JsonCssExtractionStrategy",
|
|
"params": {
|
|
"schema": {
|
|
"type": "dict", # Required wrapper
|
|
"value": { # Actual dictionary content
|
|
"name": "Products",
|
|
"baseSelector": ".product",
|
|
"fields": [
|
|
{"name": "title", "selector": "h2", "type": "text"}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
# Pattern 4: Lists and arrays
|
|
list_config = {
|
|
"type": "CrawlerRunConfig",
|
|
"params": {
|
|
"js_code": [ # Lists are handled directly
|
|
"window.scrollTo(0, document.body.scrollHeight);",
|
|
"document.querySelector('.load-more')?.click();"
|
|
],
|
|
"excluded_tags": ["script", "style", "nav"]
|
|
}
|
|
}
|
|
```
|
|
|
|
### Troubleshooting Common JSON Errors
|
|
|
|
```python
|
|
def diagnose_json_errors():
|
|
"""Common JSON structure errors and fixes"""
|
|
|
|
# ❌ WRONG: Missing type wrapper for objects
|
|
wrong_config = {
|
|
"browser_config": {
|
|
"headless": True # Missing type wrapper
|
|
}
|
|
}
|
|
|
|
# ✅ CORRECT: Proper type wrapper
|
|
correct_config = {
|
|
"browser_config": {
|
|
"type": "BrowserConfig",
|
|
"params": {
|
|
"headless": True
|
|
}
|
|
}
|
|
}
|
|
|
|
# ❌ WRONG: Dictionary without type: dict wrapper
|
|
wrong_dict = {
|
|
"schema": {
|
|
"name": "Products" # Raw dict, should be wrapped
|
|
}
|
|
}
|
|
|
|
# ✅ CORRECT: Dictionary with proper wrapper
|
|
correct_dict = {
|
|
"schema": {
|
|
"type": "dict",
|
|
"value": {
|
|
"name": "Products"
|
|
}
|
|
}
|
|
}
|
|
|
|
# ❌ WRONG: Invalid enum string
|
|
wrong_enum = {
|
|
"cache_mode": "DISABLED" # Wrong case/value
|
|
}
|
|
|
|
# ✅ CORRECT: Valid enum string
|
|
correct_enum = {
|
|
"cache_mode": "bypass" # or "enabled", "disabled", etc.
|
|
}
|
|
|
|
print("Common error patterns documented above")
|
|
|
|
# Validate your JSON structure before sending
|
|
def pre_flight_check(payload):
|
|
"""Run checks before sending to server"""
|
|
required_keys = ["urls", "browser_config", "crawler_config"]
|
|
|
|
for key in required_keys:
|
|
if key not in payload:
|
|
print(f"❌ Missing required key: {key}")
|
|
return False
|
|
|
|
# Check type wrappers
|
|
for config_key in ["browser_config", "crawler_config"]:
|
|
config = payload[config_key]
|
|
if not isinstance(config, dict) or "type" not in config:
|
|
print(f"❌ {config_key} missing type wrapper")
|
|
return False
|
|
if "params" not in config:
|
|
print(f"❌ {config_key} missing params")
|
|
return False
|
|
|
|
print("✅ Pre-flight check passed")
|
|
return True
|
|
|
|
# Example usage
|
|
payload = {
|
|
"urls": ["https://example.com"],
|
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
|
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
|
|
}
|
|
|
|
if pre_flight_check(payload):
|
|
# Safe to send to server
|
|
pass
|
|
```
|
|
|
|
**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration)