Files
crawl4ai/deploy/docker/server.py
UncleCode 53ac3ec0b4 feat(docker): add Docker service integration and config serialization
Add Docker service integration with FastAPI server and client implementation.
Implement serialization utilities for BrowserConfig and CrawlerRunConfig to support
Docker service communication. Clean up imports and improve error handling.

- Add Crawl4aiDockerClient class
- Implement config serialization/deserialization
- Add FastAPI server with streaming support
- Add health check endpoint
- Clean up imports and type hints
2025-01-31 18:00:16 +08:00

105 lines
3.6 KiB
Python

import os, sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
import json
import asyncio
from typing import AsyncGenerator
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
AsyncWebCrawler,
MemoryAdaptiveDispatcher,
RateLimiter,
)
from models import CrawlRequest, CrawlResponse
# Single FastAPI application instance; all routes below register on it.
app = FastAPI(title="Crawl4AI API")
async def stream_results(crawler: "AsyncWebCrawler", results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream crawl results as NDJSON and manage the crawler's lifecycle.

    Yields one UTF-8-encoded JSON line per result. A serialization failure
    for an individual result is reported inline as an error object instead
    of aborting the whole stream. The crawler is always closed on exit,
    whether the stream finishes, fails, or the client disconnects.
    """
    def datetime_handler(obj):
        """JSON fallback: render datetime-like objects via isoformat()."""
        if hasattr(obj, 'isoformat'):
            return obj.isoformat()
        raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
    try:
        async for result in results_gen:
            try:
                # Use dump method for serialization
                result_dict = result.model_dump()
                print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
                # Use custom JSON encoder with datetime handler
                yield (json.dumps(result_dict, default=datetime_handler) + "\n").encode('utf-8')
            except Exception as e:
                # Keep the stream alive: report this result's failure inline.
                print(f"Error serializing result: {e}")
                error_response = {
                    "error": str(e),
                    "url": getattr(result, 'url', 'unknown')
                }
                yield (json.dumps(error_response, default=datetime_handler) + "\n").encode('utf-8')
    except asyncio.CancelledError:
        # Client disconnected. Re-raise after cleanup (the finally block
        # runs first): swallowing CancelledError breaks asyncio's task
        # cancellation protocol.
        print("Client disconnected, cleaning up...")
        raise
    finally:
        try:
            await crawler.close()
        except Exception as e:
            print(f"Error closing crawler: {e}")
@app.post("/crawl")
async def crawl(request: CrawlRequest):
    """Crawl the requested URLs.

    When the crawler config has stream=True, returns an NDJSON
    StreamingResponse whose generator owns (and eventually closes) the
    crawler. Otherwise runs all crawls to completion and returns a single
    CrawlResponse. Any failure is surfaced as HTTP 500.
    """
    # Load configs using our new utilities
    browser_config = BrowserConfig.load(request.browser_config)
    crawler_config = CrawlerRunConfig.load(request.crawler_config)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=95.0,
        rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
    )
    try:
        if crawler_config.stream:
            crawler = AsyncWebCrawler(config=browser_config)
            await crawler.start()
            try:
                results_gen = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
            except Exception:
                # arun_many failed before streaming began: close the
                # crawler here, since stream_results never receives it
                # and it would otherwise leak a live browser.
                await crawler.close()
                raise
            # From here the streaming generator owns the crawler and
            # closes it when the stream ends or the client disconnects.
            return StreamingResponse(
                stream_results(crawler, results_gen),
                media_type='application/x-ndjson'
            )
        else:
            async with AsyncWebCrawler(config=browser_config) as crawler:
                results = await crawler.arun_many(
                    urls=request.urls,
                    config=crawler_config,
                    dispatcher=dispatcher
                )
                # Use dump method for each result
                results_dict = [result.model_dump() for result in results]
                return CrawlResponse(success=True, results=results_dict)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.get("/schema")
async def get_schema():
    """Expose the JSON schemas of both config models.

    Clients fetch these to validate browser/crawler config payloads
    before submitting a crawl request.
    """
    schemas = {
        "browser": BrowserConfig.model_json_schema(),
        "crawler": CrawlerRunConfig.model_json_schema(),
    }
    return schemas
@app.get("/health")
async def health():
    """Liveness probe: always reports the service as up."""
    payload = {"status": "ok"}
    return payload
if __name__ == "__main__":
    # Dev entrypoint: serve on all interfaces, port 8000, with auto-reload.
    # The "server:app" import string is required for reload=True to work.
    import uvicorn
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)