diff --git a/deploy/docker/models.py b/deploy/docker/models.py deleted file mode 100644 index 1f658e33..00000000 --- a/deploy/docker/models.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import List, Optional, Any, Dict -from pydantic import BaseModel -from crawl4ai import ( - BrowserConfig, - CrawlerRunConfig, - DefaultMarkdownGenerator, - PruningContentFilter, - BM25ContentFilter, - LLMContentFilter, - # Add other strategy classes as needed - -) - -class StrategyConfig(BaseModel): - """Base class for strategy configurations""" - type: str - params: Dict[str, Any] - - def create_instance(self): - """Convert config to actual strategy instance""" - strategy_mappings = { - # Markdown Generators - 'DefaultMarkdownGenerator': DefaultMarkdownGenerator, - - # Content Filters - 'PruningContentFilter': PruningContentFilter, - 'BM25ContentFilter': BM25ContentFilter, - 'LLMContentFilter': LLMContentFilter, - - # Add other mappings as needed - # 'CustomStrategy': CustomStrategyClass, - } - - strategy_class = strategy_mappings.get(self.type) - if not strategy_class: - raise ValueError(f"Unknown strategy type: {self.type}") - - # Handle nested strategy configurations - processed_params = {} - for key, value in self.params.items(): - if isinstance(value, dict) and 'type' in value: - # Recursively create nested strategy instances - nested_config = StrategyConfig(type=value['type'], params=value.get('params', {})) - processed_params[key] = nested_config.create_instance() - else: - processed_params[key] = value - - return strategy_class(**processed_params) - -class CrawlRequest(BaseModel): - urls: List[str] - browser_config: Optional[dict] = None - crawler_config: Optional[dict] = None - - def get_configs(self): - """Enhanced conversion of dicts to config objects""" - browser_config = BrowserConfig.from_kwargs(self.browser_config or {}) - - crawler_dict = self.crawler_config or {} - - # Process strategy configurations - for key, value in crawler_dict.items(): - if isinstance(value, dict) and 'type' in value: - # Convert strategy configuration to actual instance - strategy_config = StrategyConfig( - type=value['type'], - params=value.get('params', {}) - ) - crawler_dict[key] = strategy_config.create_instance() - - crawler_config = CrawlerRunConfig.from_kwargs(crawler_dict) - return browser_config, crawler_config - -class CrawlResponse(BaseModel): - success: bool - results: List[dict] # Will contain serialized CrawlResults - - class Config: - arbitrary_types_allowed = True \ No newline at end of file diff --git a/deploy/docker/server.py b/deploy/docker/server.py index 4e30cfec..7ec662a3 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -13,7 +13,21 @@ from crawl4ai import ( MemoryAdaptiveDispatcher, RateLimiter, ) -from models import CrawlRequest, CrawlResponse + +from typing import List, Optional +from pydantic import BaseModel + +class CrawlRequest(BaseModel): + urls: List[str] + browser_config: Optional[dict] = None + crawler_config: Optional[dict] = None + +class CrawlResponse(BaseModel): + success: bool + results: List[dict] + + class Config: + arbitrary_types_allowed = True app = FastAPI(title="Crawl4AI API") diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py deleted file mode 100644 index 69d77934..00000000 --- a/deploy/docker/utils.py +++ /dev/null @@ -1,36 +0,0 @@ -import json -from datetime import datetime - -class CrawlJSONEncoder(json.JSONEncoder): - """Custom JSON encoder for crawler results""" - def default(self, obj): - if isinstance(obj, datetime): - return obj.isoformat() - if isinstance(obj, bytes): - return obj.decode('utf-8', errors='ignore') - if hasattr(obj, 'model_dump'): - return obj.model_dump() - if hasattr(obj, '__dict__'): - return {k: v for k, v in obj.__dict__.items() if not k.startswith('_')} - return str(obj) # Fallback to string representation - -def serialize_result(result) -> dict: - """Safely serialize a crawler result""" - try: - # Convert to dict handling special cases - if hasattr(result, 'model_dump'): - result_dict = result.model_dump() - else: - result_dict = { - k: v for k, v in result.__dict__.items() - if not k.startswith('_') - } - - # Remove known non-serializable objects - result_dict.pop('ssl_certificate', None) - result_dict.pop('downloaded_files', None) - - return result_dict - except Exception as e: - print(f"Error serializing result: {e}") - return {"error": str(e), "url": getattr(result, 'url', 'unknown')} \ No newline at end of file