feat(docker): add Docker service integration and config serialization
Add Docker service integration with FastAPI server and client implementation. Implement serialization utilities for BrowserConfig and CrawlerRunConfig to support Docker service communication. Clean up imports and improve error handling. - Add Crawl4aiDockerClient class - Implement config serialization/deserialization - Add FastAPI server with streaming support - Add health check endpoint - Clean up imports and type hints
This commit is contained in:
@@ -7,17 +7,129 @@ from .config import (
|
||||
SOCIAL_MEDIA_DOMAINS,
|
||||
)
|
||||
|
||||
from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
|
||||
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||
from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from typing import Optional, Union, List
|
||||
from typing import Union, List
|
||||
from .cache_context import CacheMode
|
||||
|
||||
import inspect
|
||||
from typing import Any, Dict
|
||||
from enum import Enum
|
||||
|
||||
class BrowserConfig:
|
||||
def to_serializable_dict(obj: Any) -> Dict:
|
||||
"""
|
||||
Recursively convert an object to a serializable dictionary using {type, params} structure
|
||||
for complex objects.
|
||||
"""
|
||||
if obj is None:
|
||||
return None
|
||||
|
||||
# Handle basic types
|
||||
if isinstance(obj, (str, int, float, bool)):
|
||||
return obj
|
||||
|
||||
# Handle Enum
|
||||
if isinstance(obj, Enum):
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
"params": obj.value
|
||||
}
|
||||
|
||||
# Handle datetime objects
|
||||
if hasattr(obj, 'isoformat'):
|
||||
return obj.isoformat()
|
||||
|
||||
# Handle lists, tuples, and sets
|
||||
if isinstance(obj, (list, tuple, set)):
|
||||
return [to_serializable_dict(item) for item in obj]
|
||||
|
||||
# Handle dictionaries - preserve them as-is
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
"type": "dict", # Mark as plain dictionary
|
||||
"value": {str(k): to_serializable_dict(v) for k, v in obj.items()}
|
||||
}
|
||||
|
||||
# Handle class instances
|
||||
if hasattr(obj, '__class__'):
|
||||
# Get constructor signature
|
||||
sig = inspect.signature(obj.__class__.__init__)
|
||||
params = sig.parameters
|
||||
|
||||
# Get current values
|
||||
current_values = {}
|
||||
for name, param in params.items():
|
||||
if name == 'self':
|
||||
continue
|
||||
|
||||
value = getattr(obj, name, param.default)
|
||||
|
||||
# Only include if different from default, considering empty values
|
||||
if not (is_empty_value(value) and is_empty_value(param.default)):
|
||||
if value != param.default:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
"params": current_values
|
||||
}
|
||||
|
||||
return str(obj)
|
||||
|
||||
def from_serializable_dict(data: Any) -> Any:
|
||||
"""
|
||||
Recursively convert a serializable dictionary back to an object instance.
|
||||
"""
|
||||
if data is None:
|
||||
return None
|
||||
|
||||
# Handle basic types
|
||||
if isinstance(data, (str, int, float, bool)):
|
||||
return data
|
||||
|
||||
# Handle typed data
|
||||
if isinstance(data, dict) and "type" in data:
|
||||
# Handle plain dictionaries
|
||||
if data["type"] == "dict":
|
||||
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
return cls(data["params"])
|
||||
|
||||
# Handle class instances
|
||||
constructor_args = {
|
||||
k: from_serializable_dict(v) for k, v in data["params"].items()
|
||||
}
|
||||
return cls(**constructor_args)
|
||||
|
||||
# Handle lists
|
||||
if isinstance(data, list):
|
||||
return [from_serializable_dict(item) for item in data]
|
||||
|
||||
# Handle raw dictionaries (legacy support)
|
||||
if isinstance(data, dict):
|
||||
return {k: from_serializable_dict(v) for k, v in data.items()}
|
||||
|
||||
return data
|
||||
|
||||
def is_empty_value(value: Any) -> bool:
|
||||
"""Check if a value is effectively empty/null."""
|
||||
if value is None:
|
||||
return True
|
||||
if isinstance(value, (list, tuple, set, dict, str)) and len(value) == 0:
|
||||
return True
|
||||
return False
|
||||
|
||||
class BrowserConfig():
|
||||
"""
|
||||
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
|
||||
|
||||
@@ -239,8 +351,18 @@ class BrowserConfig:
|
||||
config_dict.update(kwargs)
|
||||
return BrowserConfig.from_kwargs(config_dict)
|
||||
|
||||
# Create a funciton returns dict of the object
|
||||
def dump(self) -> dict:
|
||||
# Serialize the object to a dictionary
|
||||
return to_serializable_dict(self)
|
||||
|
||||
class CrawlerRunConfig:
|
||||
@staticmethod
|
||||
def load( data: dict) -> "BrowserConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
return from_serializable_dict(data)
|
||||
|
||||
|
||||
class CrawlerRunConfig():
|
||||
"""
|
||||
Configuration class for controlling how the crawler runs each crawl operation.
|
||||
This includes parameters for content extraction, page manipulation, waiting conditions,
|
||||
@@ -665,6 +787,15 @@ class CrawlerRunConfig:
|
||||
)
|
||||
|
||||
# Create a funciton returns dict of the object
|
||||
def dump(self) -> dict:
|
||||
# Serialize the object to a dictionary
|
||||
return to_serializable_dict(self)
|
||||
|
||||
@staticmethod
|
||||
def load(data: dict) -> "CrawlerRunConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
return from_serializable_dict(data)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"word_count_threshold": self.word_count_threshold,
|
||||
|
||||
Reference in New Issue
Block a user