refactor(docker): remove unused models and utilities for cleaner codebase
This commit is contained in:
@@ -1,79 +0,0 @@
|
||||
from typing import List, Optional, Any, Dict
|
||||
from pydantic import BaseModel
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
BM25ContentFilter,
|
||||
LLMContentFilter,
|
||||
# Add other strategy classes as needed
|
||||
|
||||
)
|
||||
|
||||
class StrategyConfig(BaseModel):
    """Declarative description of a crawl4ai strategy.

    Attributes:
        type: Registry key naming the strategy class
            (e.g. ``'PruningContentFilter'``).
        params: Keyword arguments for the strategy constructor.  A value that
            is itself a dict carrying a ``'type'`` key is treated as a nested
            strategy config and instantiated recursively.
    """
    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Convert this config to an actual strategy instance.

        Returns:
            A new instance of the mapped strategy class, constructed with
            ``self.params`` after nested strategy configs are resolved.

        Raises:
            ValueError: If ``self.type`` is not a known strategy name.
        """
        strategy_mappings = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,

            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,

            # Add other mappings as needed
            # 'CustomStrategy': CustomStrategyClass,
        }

        strategy_class = strategy_mappings.get(self.type)
        if not strategy_class:
            # List the valid names so a typo in an API request is easy to
            # diagnose from the error alone.
            raise ValueError(
                f"Unknown strategy type: {self.type}. "
                f"Expected one of: {sorted(strategy_mappings)}"
            )

        # Resolve nested strategy configurations before constructing.
        processed_params = {
            key: self._resolve_param(value)
            for key, value in self.params.items()
        }
        return strategy_class(**processed_params)

    @staticmethod
    def _resolve_param(value):
        """Recursively build a strategy instance from a ``{'type', 'params'}``
        dict; return any other value unchanged."""
        if isinstance(value, dict) and 'type' in value:
            nested = StrategyConfig(
                type=value['type'],
                params=value.get('params', {}),
            )
            return nested.create_instance()
        return value
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request payload for the crawl endpoint.

    Attributes:
        urls: Pages to crawl.
        browser_config: Optional kwargs for ``BrowserConfig``.
        crawler_config: Optional kwargs for ``CrawlerRunConfig``; values shaped
            like ``{'type': ..., 'params': {...}}`` are resolved into strategy
            instances.
    """
    urls: List[str]
    browser_config: Optional[dict] = None
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Enhanced conversion of dicts to config objects.

        Returns:
            Tuple of ``(BrowserConfig, CrawlerRunConfig)``.

        Note:
            Builds a new dict rather than mutating ``self.crawler_config``.
            The previous implementation wrote strategy *instances* back into
            the request model's own dict, so the model no longer matched its
            declared ``Optional[dict]`` shape and a repeated call operated on
            already-instantiated values.
        """
        browser_config = BrowserConfig.from_kwargs(self.browser_config or {})

        # Process strategy configurations into a fresh dict; never mutate
        # the pydantic model's stored value.
        crawler_dict = {}
        for key, value in (self.crawler_config or {}).items():
            if isinstance(value, dict) and 'type' in value:
                # Convert strategy configuration to actual instance.
                strategy_config = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {}),
                )
                crawler_dict[key] = strategy_config.create_instance()
            else:
                crawler_dict[key] = value

        crawler_config = CrawlerRunConfig.from_kwargs(crawler_dict)
        return browser_config, crawler_config
|
||||
|
||||
class CrawlResponse(BaseModel):
    """Response payload for the crawl endpoint."""
    # True when the crawl run as a whole completed without a fatal error.
    success: bool
    results: List[dict]  # Will contain serialized CrawlResults

    class Config:
        # Serialized results may embed arbitrary (non-pydantic) objects;
        # let them pass through validation.
        arbitrary_types_allowed = True
|
||||
@@ -13,7 +13,21 @@ from crawl4ai import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
RateLimiter,
|
||||
)
|
||||
from models import CrawlRequest, CrawlResponse
|
||||
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request payload for the crawl endpoint.

    NOTE(review): this inline definition shadows the ``CrawlRequest``
    imported from ``models`` earlier in this file — one of the two
    definitions should be removed.
    """
    urls: List[str]  # pages to crawl
    browser_config: Optional[dict] = None   # kwargs for BrowserConfig
    crawler_config: Optional[dict] = None   # kwargs for CrawlerRunConfig
|
||||
|
||||
class CrawlResponse(BaseModel):
    """Response payload for the crawl endpoint.

    NOTE(review): this inline definition shadows the ``CrawlResponse``
    imported from ``models`` earlier in this file — one of the two
    definitions should be removed.
    """
    # True when the crawl run as a whole completed without a fatal error.
    success: bool
    results: List[dict]  # serialized crawl results

    class Config:
        # Results may embed arbitrary (non-pydantic) objects.
        arbitrary_types_allowed = True
|
||||
|
||||
# FastAPI application instance; routes are registered against this object.
app = FastAPI(title="Crawl4AI API")
|
||||
|
||||
|
||||
@@ -1,36 +0,0 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
class CrawlJSONEncoder(json.JSONEncoder):
    """JSON encoder that understands crawler result objects."""

    def default(self, obj):
        """Serialize values the stock encoder rejects, falling back to str().

        Handles datetimes (ISO-8601), raw bytes (lenient UTF-8 decode),
        pydantic models (``model_dump``), and plain objects (public
        attributes of ``__dict__``).
        """
        # Timestamps become ISO-8601 strings.
        if isinstance(obj, datetime):
            return obj.isoformat()

        # Decode bytes leniently so binary noise never aborts a dump.
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')

        # Pydantic models expose model_dump().
        dump = getattr(obj, 'model_dump', None)
        if dump is not None:
            return dump()

        # Plain objects: export public attributes only.
        attrs = getattr(obj, '__dict__', None)
        if attrs is not None:
            return {
                name: value
                for name, value in attrs.items()
                if not name.startswith('_')
            }

        return str(obj)  # Fallback to string representation
|
||||
|
||||
def serialize_result(result) -> dict:
    """Safely serialize a crawler result.

    Prefers the object's ``model_dump()`` (pydantic); otherwise exports the
    public entries of its instance dict.  Known non-JSON-safe fields are
    stripped.  Any failure is reported as an error dict rather than raised,
    so one bad result never aborts a whole response.
    """
    try:
        dump = getattr(result, 'model_dump', None)
        if dump is not None:
            payload = dump()
        else:
            # Fall back to the instance dict, keeping public attributes only.
            payload = {
                name: value
                for name, value in vars(result).items()
                if not name.startswith('_')
            }

        # Remove known non-serializable objects.
        for unsafe in ('ssl_certificate', 'downloaded_files'):
            payload.pop(unsafe, None)

        return payload
    except Exception as exc:
        print(f"Error serializing result: {exc}")
        return {"error": str(exc), "url": getattr(result, 'url', 'unknown')}
|
||||
Reference in New Issue
Block a user