refactor(docker): remove unused models and utilities for cleaner codebase

This commit is contained in:
UncleCode
2025-02-01 20:10:13 +08:00
parent 2f15976b34
commit 7b1ef07c41
3 changed files with 15 additions and 116 deletions

View File

@@ -1,79 +0,0 @@
from typing import List, Optional, Any, Dict
from pydantic import BaseModel
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter,
# Add other strategy classes as needed
)
class StrategyConfig(BaseModel):
    """Declarative description of a strategy: a class name plus its kwargs."""
    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Instantiate the strategy class named by ``type`` using ``params``.

        Any param that itself looks like a strategy config (a dict with a
        'type' key) is recursively instantiated first, so nested strategies
        (e.g. a content filter inside a markdown generator) are supported.

        Raises:
            ValueError: if ``type`` names no known strategy class.
        """
        known_strategies = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,
            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,
            # Add other mappings as needed
            # 'CustomStrategy': CustomStrategyClass,
        }
        strategy_cls = known_strategies.get(self.type)
        if strategy_cls is None:
            raise ValueError(f"Unknown strategy type: {self.type}")
        # Resolve nested strategy configs into live instances; pass
        # everything else through unchanged.
        kwargs = {
            key: (
                StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {}),
                ).create_instance()
                if isinstance(value, dict) and 'type' in value
                else value
            )
            for key, value in self.params.items()
        }
        return strategy_cls(**kwargs)
class CrawlRequest(BaseModel):
    """Incoming crawl request: target URLs plus optional raw config dicts."""
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Convert the raw config dicts into config objects.

        Top-level entries of the crawler dict that look like strategy
        configs (dicts with a 'type' key) are replaced in place by real
        strategy instances before building the CrawlerRunConfig.

        Returns:
            tuple: (BrowserConfig, CrawlerRunConfig)
        """
        browser = BrowserConfig.from_kwargs(self.browser_config or {})
        crawler_kwargs = self.crawler_config or {}
        for name, value in crawler_kwargs.items():
            if isinstance(value, dict) and 'type' in value:
                # Swap the declarative config for a live strategy object.
                crawler_kwargs[name] = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {}),
                ).create_instance()
        return browser, CrawlerRunConfig.from_kwargs(crawler_kwargs)
class CrawlResponse(BaseModel):
    """Response payload for a crawl: overall success flag plus results."""
    success: bool
    results: List[dict]  # Will contain serialized CrawlResults
    class Config:
        # Result dicts may carry values pydantic cannot natively validate.
        arbitrary_types_allowed = True

View File

@@ -13,7 +13,21 @@ from crawl4ai import (
MemoryAdaptiveDispatcher,
RateLimiter,
)
from models import CrawlRequest, CrawlResponse
from typing import List, Optional
from pydantic import BaseModel
class CrawlRequest(BaseModel):
    """Body of a crawl request: URLs plus optional raw config dicts."""
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None
class CrawlResponse(BaseModel):
    """Crawl response: success flag and serialized result dicts."""
    success: bool
    results: List[dict]
    class Config:
        # Result dicts may carry values pydantic cannot natively validate.
        arbitrary_types_allowed = True
app = FastAPI(title="Crawl4AI API")

View File

@@ -1,36 +0,0 @@
import json
from datetime import datetime
class CrawlJSONEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize crawler result objects."""

    def default(self, obj):
        """Serialize values ``json`` cannot handle natively.

        Check order matters: datetimes and bytes are handled before the
        generic object cases, and ``str()`` is the last resort.
        """
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            # Best-effort text decoding; undecodable bytes are dropped.
            return obj.decode('utf-8', errors='ignore')
        if hasattr(obj, 'model_dump'):
            # Pydantic-style models serialize themselves.
            return obj.model_dump()
        if hasattr(obj, '__dict__'):
            # Plain objects: public attributes only.
            return {
                name: value
                for name, value in obj.__dict__.items()
                if not name.startswith('_')
            }
        return str(obj)  # Fallback to string representation
def serialize_result(result) -> dict:
    """Safely serialize a crawler result into a JSON-friendly dict.

    Prefers the object's own ``model_dump`` when available; otherwise
    copies its public attributes. Known non-serializable fields are
    stripped. Serialization errors are reported rather than raised.
    """
    try:
        if hasattr(result, 'model_dump'):
            data = result.model_dump()
        else:
            # Public attributes only.
            data = {
                name: value
                for name, value in result.__dict__.items()
                if not name.startswith('_')
            }
        # Drop fields known to resist JSON serialization.
        for unserializable in ('ssl_certificate', 'downloaded_files'):
            data.pop(unserializable, None)
        return data
    except Exception as e:
        # Best-effort: never let serialization failures propagate.
        print(f"Error serializing result: {e}")
        return {"error": str(e), "url": getattr(result, 'url', 'unknown')}