refactor(docker): remove unused models and utilities for cleaner codebase
This commit is contained in:
@@ -1,79 +0,0 @@
|
|||||||
from typing import List, Optional, Any, Dict
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from crawl4ai import (
|
|
||||||
BrowserConfig,
|
|
||||||
CrawlerRunConfig,
|
|
||||||
DefaultMarkdownGenerator,
|
|
||||||
PruningContentFilter,
|
|
||||||
BM25ContentFilter,
|
|
||||||
LLMContentFilter,
|
|
||||||
# Add other strategy classes as needed
|
|
||||||
|
|
||||||
)
|
|
||||||
|
|
||||||
class StrategyConfig(BaseModel):
    """Declarative description of a crawl4ai strategy.

    ``type`` names the strategy class and ``params`` holds its constructor
    keyword arguments; nested strategy configs inside ``params`` are resolved
    recursively by :meth:`create_instance`.
    """

    type: str
    params: Dict[str, Any]

    def create_instance(self):
        """Instantiate the strategy class described by this config.

        Raises:
            ValueError: if ``type`` does not name a known strategy.
        """
        # Registry of supported strategy names -> classes.
        registry = {
            # Markdown Generators
            'DefaultMarkdownGenerator': DefaultMarkdownGenerator,
            # Content Filters
            'PruningContentFilter': PruningContentFilter,
            'BM25ContentFilter': BM25ContentFilter,
            'LLMContentFilter': LLMContentFilter,
            # Add other mappings as needed
            # 'CustomStrategy': CustomStrategyClass,
        }

        strategy_cls = registry.get(self.type)
        if strategy_cls is None:
            raise ValueError(f"Unknown strategy type: {self.type}")

        # Resolve any nested {"type": ..., "params": ...} dicts into real
        # strategy instances before constructing this one.
        kwargs = {}
        for name, value in self.params.items():
            if isinstance(value, dict) and 'type' in value:
                nested = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {}),
                )
                kwargs[name] = nested.create_instance()
            else:
                kwargs[name] = value

        return strategy_cls(**kwargs)
|
|
||||||
|
|
||||||
class CrawlRequest(BaseModel):
    """Incoming crawl request: target URLs plus optional raw config dicts."""

    # Pages to crawl.
    urls: List[str]
    # Keyword arguments for crawl4ai BrowserConfig.
    browser_config: Optional[dict] = None
    # Keyword arguments for crawl4ai CrawlerRunConfig; values shaped like
    # {"type": ..., "params": ...} are treated as strategy configs.
    crawler_config: Optional[dict] = None

    def get_configs(self):
        """Convert the raw config dicts into crawl4ai config objects.

        Returns:
            tuple: ``(BrowserConfig, CrawlerRunConfig)`` built from this
            request's ``browser_config`` and ``crawler_config``.
        """
        browser_config = BrowserConfig.from_kwargs(self.browser_config or {})

        # Work on a shallow copy: the previous implementation wrote strategy
        # instances back into self.crawler_config itself, mutating the model
        # and making repeated get_configs() calls behave differently.
        crawler_dict = dict(self.crawler_config or {})

        # Replace {"type": ..., "params": ...} descriptions with instances.
        for key, value in crawler_dict.items():
            if isinstance(value, dict) and 'type' in value:
                strategy_config = StrategyConfig(
                    type=value['type'],
                    params=value.get('params', {})
                )
                crawler_dict[key] = strategy_config.create_instance()

        crawler_config = CrawlerRunConfig.from_kwargs(crawler_dict)
        return browser_config, crawler_config
|
|
||||||
|
|
||||||
class CrawlResponse(BaseModel):
    """API response wrapping serialized crawl results."""

    # True when the crawl batch completed without a top-level failure.
    success: bool
    results: List[dict]  # Will contain serialized CrawlResults

    class Config:
        # Result payloads may carry types pydantic cannot validate natively.
        arbitrary_types_allowed = True
|
|
||||||
@@ -13,7 +13,21 @@ from crawl4ai import (
|
|||||||
MemoryAdaptiveDispatcher,
|
MemoryAdaptiveDispatcher,
|
||||||
RateLimiter,
|
RateLimiter,
|
||||||
)
|
)
|
||||||
from models import CrawlRequest, CrawlResponse
|
|
||||||
|
from typing import List, Optional
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
class CrawlRequest(BaseModel):
    """Request body for the crawl endpoint."""

    # Pages to crawl.
    urls: List[str]
    # Optional raw kwargs dict for the browser configuration.
    browser_config: Optional[dict] = None
    # Optional raw kwargs dict for the crawler run configuration.
    crawler_config: Optional[dict] = None
|
||||||
|
|
||||||
|
class CrawlResponse(BaseModel):
    """Response body for the crawl endpoint."""

    # True when the crawl batch completed without a top-level failure.
    success: bool
    # Serialized crawl result dicts.
    results: List[dict]

    class Config:
        # Result payloads may carry types pydantic cannot validate natively.
        arbitrary_types_allowed = True
|
||||||
|
|
||||||
app = FastAPI(title="Crawl4AI API")
|
app = FastAPI(title="Crawl4AI API")
|
||||||
|
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
import json
|
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
class CrawlJSONEncoder(json.JSONEncoder):
    """JSON encoder that knows how to serialize crawler result objects."""

    def default(self, obj):
        """Coerce non-JSON-native values into serializable forms.

        Handles datetimes (ISO 8601 string), raw bytes (lossy UTF-8 decode),
        pydantic-style models (``model_dump()``), and plain objects (public
        ``__dict__`` entries); anything else falls back to ``str(obj)``.
        """
        if isinstance(obj, datetime):
            return obj.isoformat()
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='ignore')
        dump = getattr(obj, 'model_dump', None)
        if dump is not None:
            return dump()
        attrs = getattr(obj, '__dict__', None)
        if attrs is not None:
            # Keep only public attributes.
            return {name: val for name, val in attrs.items()
                    if not name.startswith('_')}
        return str(obj)  # Fallback to string representation
|
|
||||||
|
|
||||||
def serialize_result(result) -> dict:
    """Best-effort conversion of a crawler result into a plain dict.

    Prefers the object's ``model_dump()`` when present, otherwise collects
    its public attributes. Fields known to be non-serializable are stripped.
    On any failure a minimal error dict is returned instead of raising.
    """
    try:
        dump = getattr(result, 'model_dump', None)
        if dump is not None:
            result_dict = dump()
        else:
            # Fall back to the instance's public attributes.
            result_dict = {
                name: val
                for name, val in result.__dict__.items()
                if not name.startswith('_')
            }

        # Strip fields known not to survive JSON serialization.
        for unserializable in ('ssl_certificate', 'downloaded_files'):
            result_dict.pop(unserializable, None)

        return result_dict
    except Exception as e:
        print(f"Error serializing result: {e}")
        return {"error": str(e), "url": getattr(result, 'url', 'unknown')}
|
|
||||||
Reference in New Issue
Block a user