feat(docker): add Docker deployment configuration and API server

Add Docker deployment setup with FastAPI server implementation for Crawl4AI:
- Create Dockerfile with Python 3.10 and Playwright dependencies
- Implement FastAPI server with streaming and non-streaming endpoints
- Add request/response models and JSON serialization
- Include test script for API verification

Also includes:
- Update .gitignore for Continue development files
- Add project rules in .continuerules
- Clean up async_dispatcher.py formatting
This commit is contained in:
UncleCode
2025-01-31 15:22:21 +08:00
parent f81712eb91
commit ce4f04dad2
9 changed files with 426 additions and 638 deletions

18
deploy/docker/Dockerfile Normal file
View File

@@ -0,0 +1,18 @@
FROM python:3.10-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies first so this layer caches across
# source-only changes.
COPY requirements.txt .
RUN pip install -r requirements.txt

# Install the Playwright browser AFTER pip install: the original ran this
# before any Python package was installed, so the `playwright` CLI was not
# on PATH and the image build failed.
RUN playwright install --with-deps chromium

COPY . .

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]

78
deploy/docker/models.py Normal file
View File

@@ -0,0 +1,78 @@
from typing import List, Optional, Any, Dict
from pydantic import BaseModel
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
# Add other strategy classes as needed
)
class StrategyConfig(BaseModel):
    """Declarative description of a strategy: a class name plus kwargs.

    ``create_instance()`` resolves the name against a whitelist of known
    strategy classes and constructs the object, recursively instantiating
    any nested strategy configs found inside ``params``.
    """
    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Build and return the strategy instance this config describes."""
        # Whitelist of resolvable strategy classes.
        known_strategies = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,
            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,
            # Add other mappings as needed
            # 'CustomStrategy': CustomStrategyClass,
        }
        strategy_cls = known_strategies.get(self.type)
        if strategy_cls is None:
            raise ValueError(f"Unknown strategy type: {self.type}")
        # Assemble constructor kwargs, turning nested
        # {'type': ..., 'params': ...} dicts into live instances first.
        kwargs = {}
        for name, raw in self.params.items():
            if isinstance(raw, dict) and 'type' in raw:
                nested = StrategyConfig(type=raw['type'], params=raw.get('params', {}))
                kwargs[name] = nested.create_instance()
            else:
                kwargs[name] = raw
        return strategy_cls(**kwargs)
class CrawlRequest(BaseModel):
    """Payload for the /crawl endpoint: target URLs plus optional raw
    browser/crawler config dicts."""
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Convert the raw dicts into (BrowserConfig, CrawlerRunConfig).

        Nested strategy descriptions ({'type': ..., 'params': ...}) inside
        crawler_config are turned into live instances via StrategyConfig.
        Operates on a copy so the request model itself is never mutated.
        """
        browser_config = BrowserConfig.from_kwargs(self.browser_config or {})
        # Copy before rewriting entries: the previous code aliased
        # self.crawler_config and replaced its values with strategy
        # instances in place, corrupting the request object if
        # get_configs() ran more than once (or the dict was reused).
        crawler_dict = dict(self.crawler_config or {})
        # Process strategy configurations
        for key, value in crawler_dict.items():
            if isinstance(value, dict) and 'type' in value:
                # Convert strategy configuration to an actual instance
                strategy_config = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {})
                )
                crawler_dict[key] = strategy_config.create_instance()
        crawler_config = CrawlerRunConfig.from_kwargs(crawler_dict)
        return browser_config, crawler_config
class CrawlResponse(BaseModel):
    """Envelope returned by the non-streaming /crawl path."""
    success: bool
    results: List[dict]  # Will contain serialized CrawlResults
    class Config:
        # Serialized results may embed values pydantic cannot validate.
        arbitrary_types_allowed = True

View File

@@ -0,0 +1,3 @@
crawl4ai
fastapi
uvicorn

148
deploy/docker/server.py Normal file
View File

@@ -0,0 +1,148 @@
# pyright: ignore
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import json
import asyncio
from typing import AsyncGenerator
from datetime import datetime
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
AsyncWebCrawler,
MemoryAdaptiveDispatcher,
RateLimiter,
)
from .models import CrawlRequest, CrawlResponse
class CrawlJSONEncoder(json.JSONEncoder):
    """JSON encoder for crawler result objects.

    Conversion order: datetimes -> ISO-8601 strings, bytes -> UTF-8 text
    (undecodable bytes dropped), objects exposing model_dump() -> their
    dump, objects with __dict__ -> public attributes only, anything
    else -> str().
    """

    def default(self, obj):
        # Timestamps serialize as ISO-8601 strings.
        if isinstance(obj, datetime):
            return obj.isoformat()
        # Raw bytes become text; invalid sequences are silently dropped.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        # Pydantic-style models provide model_dump().
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        # Generic objects: expose only their public attributes.
        if hasattr(obj, '__dict__'):
            return {name: value
                    for name, value in obj.__dict__.items()
                    if not name.startswith('_')}
        # Last resort: string representation.
        return str(obj)
def serialize_result(result) -> dict:
    """Best-effort conversion of a crawler result into a JSON-safe dict.

    Objects exposing model_dump() are dumped that way; anything else
    falls back to its public __dict__. Fields known to hold
    non-serializable values are stripped. Failures never propagate:
    an error stub carrying the result's URL is returned instead.
    """
    try:
        if hasattr(result, 'model_dump'):
            data = result.model_dump()
        else:
            data = {key: val
                    for key, val in result.__dict__.items()
                    if not key.startswith('_')}
        # These fields hold objects json cannot encode; drop them.
        for field in ('ssl_certificate', 'downloaded_files'):
            data.pop(field, None)
        return data
    except Exception as e:
        print(f"Error serializing result: {e}")
        return {"error": str(e), "url": getattr(result, 'url', 'unknown')}
# Module-level FastAPI application; uvicorn serves this as "server:app".
app = FastAPI(title="Crawl4AI API")
async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream results and manage crawler lifecycle.

    Yields one NDJSON-encoded line (bytes) per crawl result. The crawler
    passed in is owned by this generator: the finally block closes it
    whether the stream completes, a result fails to serialize, or the
    client disconnects mid-stream (CancelledError).
    """
    try:
        async for result in results_gen:
            try:
                # Handle serialization of result
                result_dict = serialize_result(result)
                # Remove non-serializable objects
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                yield (json.dumps(result_dict, cls=CrawlJSONEncoder) + "\n").encode('utf-8')
            except Exception as e:
                # Log error but continue streaming: one bad result must not
                # terminate the whole response.
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        # Handle client disconnection gracefully
        print("Client disconnected, cleaning up...")
    finally:
        # Ensure crawler cleanup happens in all cases
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")
@app.post("/crawl")
async def crawl(request: CrawlRequest):
    """Crawl one or more URLs.

    Returns an NDJSON StreamingResponse when crawler_config.stream is
    true, otherwise a single CrawlResponse containing all serialized
    results. Any crawl failure surfaces as HTTP 500.
    """
    browser_config, crawler_config = request.get_configs()
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
        # monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )
    try:
        if crawler_config.stream:
            # For streaming, manage the crawler lifecycle manually; once
            # StreamingResponse is returned, stream_results owns cleanup.
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()
            try:
                results_gen = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                return StreamingResponse(
                    stream_results(crawler, results_gen),
                    media_type='application/x-ndjson'
                )
            except Exception:
                # Setup failed before the generator ever ran, so the
                # finally-close inside stream_results will never fire;
                # close here to avoid leaking the browser.
                await crawler.close()
                raise
        else:
            # For non-streaming, use context manager
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Reuse the shared serializer so streaming and
                # non-streaming responses are shaped identically (the
                # previous inline copy silently dropped results that
                # failed to serialize instead of emitting an error stub).
                results_dict = [serialize_result(result) for result in results]
                return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/schema")
async def get_schema():
    """Expose the pydantic JSON schemas for both config objects so
    clients can validate payloads before submitting them."""
    schemas = {
        "browser": BrowserConfig.model_json_schema(),
        "crawler": CrawlerRunConfig.model_json_schema(),
    }
    return schemas
if __name__ == "__main__":
    import uvicorn
    # Dev entry point: serve on all interfaces with auto reload.
    # WARNING: You must pass the application as an import string to enable 'reload' or 'workers'.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)

108
deploy/docker/test.py Normal file
View File

@@ -0,0 +1,108 @@
import httpx
import asyncio
import json
async def test_regular():
    """Exercise the non-streaming endpoint with a small batch of URLs."""
    payload = {
        "urls": ["https://example.com"] * 3,  # Test with 3 identical URLs
        "browser_config": {
            "headless": True,
            "verbose": False
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "stream": False
        }
    }
    async with httpx.AsyncClient() as client:
        response = await client.post("http://localhost:8000/crawl", json=payload)
        results = response.json()
        print("\nRegular Response:")
        print(f"Got {len(results['results'])} results at once")
        for result in results['results']:
            print(f"URL: {result['url']}, Success: {result['success']}")
async def test_streaming():
    """Test streaming API call.

    Uses httpx's stream() API so lines are consumed as the server emits
    them. The previous version awaited client.post(), which buffers the
    entire NDJSON body before aiter_lines() ever runs — that exercised
    the endpoint but never actually tested incremental streaming.
    """
    async with httpx.AsyncClient() as client:
        try:
            async with client.stream(
                "POST",
                "http://localhost:8000/crawl",
                json={
                    "urls": ["https://example.com"] * 3,
                    "browser_config": {
                        "headless": True,
                        "verbose": False
                    },
                    "crawler_config": {
                        "cache_mode": "BYPASS",
                        "stream": True
                    }
                },
                timeout=30.0
            ) as response:
                print("\nStreaming Response:")
                async for line in response.aiter_lines():
                    if line.strip():
                        try:
                            result = json.loads(line)
                            print(f"Received result for URL: {result['url']}, Success: {result['success']}")
                        except json.JSONDecodeError as e:
                            # A malformed line should not abort the stream.
                            print(f"Error decoding response: {e}")
                            continue
        except Exception as e:
            print(f"Error during streaming: {e}")
async def test_complex_config():
    """Test API with complex nested configurations"""
    request_body = {
        "urls": ["https://en.wikipedia.org/wiki/Apple"],
        "browser_config": {
            "headless": True,
            "verbose": False
        },
        "crawler_config": {
            "cache_mode": "BYPASS",
            "excluded_tags": ["nav", "footer", "aside"],
            "remove_overlay_elements": True,
            # Nested strategy config resolved server-side by StrategyConfig.
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {
                            "threshold": 0.48,
                            "threshold_type": "fixed",
                            "min_word_threshold": 0
                        }
                    },
                    "options": {"ignore_links": True}
                }
            }
        }
    }
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:8000/crawl", timeout=30.0, json=request_body
        )
        result = response.json()
        if result['success']:
            for r in result['results']:
                print(f"Full Markdown Length: {len(r['markdown_v2']['raw_markdown'])}")
                print(f"Fit Markdown Length: {len(r['markdown_v2']['fit_markdown'])}")
async def main():
    """Run both tests"""
    print("Testing Crawl4AI API...")
    # Other scenarios, kept handy for manual runs:
    # print("\n1. Testing regular (non-streaming) endpoint...")
    # await test_regular()
    # print("\n2. Testing streaming endpoint...")
    # await test_streaming()
    print("\n3. Testing complex configuration...")
    await test_complex_config()
if __name__ == "__main__":
    # Script entry point: drive the async test suite on a fresh event loop.
    asyncio.run(main())