Add Docker service integration with FastAPI server and client implementation. Implement serialization utilities for BrowserConfig and CrawlerRunConfig to support Docker service communication. Clean up imports and improve error handling.
- Add Crawl4aiDockerClient class
- Implement config serialization/deserialization
- Add FastAPI server with streaming support
- Add health check endpoint
- Clean up imports and type hints
79 lines
2.6 KiB
Python
79 lines
2.6 KiB
Python
from typing import List, Optional, Any, Dict
|
|
from pydantic import BaseModel
|
|
from crawl4ai import (
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
DefaultMarkdownGenerator,
|
|
PruningContentFilter,
|
|
BM25ContentFilter,
|
|
LLMContentFilter,
|
|
# Add other strategy classes as needed
|
|
|
|
)
|
|
|
|
class StrategyConfig(BaseModel):
    """Serializable description of a strategy: a class name plus its kwargs.

    ``type`` names one of the strategy classes known to ``create_instance``;
    ``params`` holds the keyword arguments passed to that class. A value in
    ``params`` that is itself a dict with a ``'type'`` key is treated as a
    nested strategy description.
    """

    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Build the strategy object this config describes.

        Returns:
            An instance of the class registered under ``self.type``,
            constructed with ``self.params`` after nested strategy dicts
            have been replaced by their own instances.

        Raises:
            ValueError: If ``self.type`` (or a nested ``type``) is not one of
                the registered strategy names.
        """
        registry = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,

            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,

            # Add other mappings as needed
            # 'CustomStrategy': CustomStrategyClass,
        }

        # Reject unknown names before touching any parameters.
        if self.type not in registry:
            raise ValueError(f"Unknown strategy type: {self.type}")
        target_cls = registry[self.type]

        # Dicts carrying a 'type' key are nested strategy descriptions and are
        # instantiated recursively; every other value is passed through as-is.
        resolved_kwargs = {
            name: (
                StrategyConfig(
                    type=raw['type'],
                    params=raw.get('params', {}),
                ).create_instance()
                if isinstance(raw, dict) and 'type' in raw
                else raw
            )
            for name, raw in self.params.items()
        }

        return target_cls(**resolved_kwargs)
|
|
|
|
class CrawlRequest(BaseModel):
    """Request body for a crawl: target URLs plus optional config dicts.

    ``browser_config`` and ``crawler_config`` arrive as plain dicts and are
    turned into config objects by ``get_configs``.
    """

    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Convert the request's config dicts into config objects.

        Any top-level ``crawler_config`` value that is a dict containing a
        ``'type'`` key is treated as a strategy description and replaced by
        the instance produced by ``StrategyConfig.create_instance``.

        The request's own ``crawler_config`` dict is left untouched, so this
        method can be called repeatedly and the model still holds plain
        dicts afterwards.

        Returns:
            A ``(browser_config, crawler_config)`` tuple, built with
            ``BrowserConfig.from_kwargs`` and ``CrawlerRunConfig.from_kwargs``
            respectively. A missing (``None``) or empty config dict is passed
            on as an empty dict.

        Raises:
            ValueError: If a strategy description names an unknown type
                (raised by ``StrategyConfig.create_instance``).
        """
        browser_config = BrowserConfig.from_kwargs(self.browser_config or {})

        # Collect into a fresh dict instead of writing strategy instances back
        # into self.crawler_config: mutating the model's field would make a
        # second call (or later serialization of the request) see live
        # strategy objects where the client originally sent plain dicts.
        crawler_kwargs = {}
        for key, value in (self.crawler_config or {}).items():
            if isinstance(value, dict) and 'type' in value:
                # Convert strategy configuration to an actual instance.
                value = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {}),
                ).create_instance()
            crawler_kwargs[key] = value

        crawler_config = CrawlerRunConfig.from_kwargs(crawler_kwargs)
        return browser_config, crawler_config
|
|
|
|
class CrawlResponse(BaseModel):
    """Response body for a crawl: a success flag and a list of result dicts."""

    success: bool
    # Will contain serialized CrawlResults.
    results: List[dict]

    class Config:
        # Pydantic model option, kept exactly as in the original definition.
        arbitrary_types_allowed = True