feat(config): enhance serialization and add deep crawling exports
Improve configuration serialization with better handling of frozensets and slots. Expand deep crawling module exports and documentation. Add comprehensive API usage examples in Docker README. - Add support for frozenset serialization - Improve error handling in config loading - Export additional deep crawling components - Enhance Docker API documentation with detailed examples - Fix ContentTypeFilter initialization
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import re
|
||||
from attr import has
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
@@ -17,6 +19,7 @@ from typing import Union, List
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
@@ -47,7 +50,11 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
# Handle lists, tuples, and sets
|
||||
if isinstance(obj, (list, tuple, set)):
|
||||
return [to_serializable_dict(item) for item in obj]
|
||||
|
||||
|
||||
# Handle frozensets, which are not iterable
|
||||
if isinstance(obj, frozenset):
|
||||
return [to_serializable_dict(item) for item in list(obj)]
|
||||
|
||||
# Handle dictionaries - preserve them as-is
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
# Get constructor signature
|
||||
sig = inspect.signature(obj.__class__.__init__)
|
||||
params = sig.parameters
|
||||
_type = obj.__class__.__name__
|
||||
|
||||
# Get current values
|
||||
current_values = {}
|
||||
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
if not (is_empty_value(value) and is_empty_value(param.default)):
|
||||
if value != param.default:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
|
||||
slot = f"_{name}"
|
||||
slot_value = getattr(obj, slot, None)
|
||||
if not is_empty_value(slot_value):
|
||||
current_values[name] = to_serializable_dict(slot_value)
|
||||
|
||||
|
||||
# # Then handle slots if present
|
||||
# if hasattr(obj.__class__, '__slots__'):
|
||||
# for slot in obj.__class__.__slots__:
|
||||
# # Remove leading underscore if present
|
||||
# param_name = slot[1:] if slot.startswith('_') else slot
|
||||
|
||||
# # Get the slot value if it exists
|
||||
# if hasattr(obj, slot):
|
||||
# value = getattr(obj, slot)
|
||||
# if not is_empty_value(value):
|
||||
# current_values[param_name] = to_serializable_dict(value)
|
||||
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
@@ -100,7 +126,10 @@ def from_serializable_dict(data: Any) -> Any:
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
if not hasattr(crawl4ai, data["type"]):
|
||||
return None
|
||||
else:
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
@@ -361,7 +390,14 @@ class BrowserConfig():
|
||||
def load( data: dict) -> "BrowserConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
# check if the deserialized object is an instance of BrowserConfig
|
||||
if isinstance(config, BrowserConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type for BrowserConfig")
|
||||
|
||||
|
||||
class CrawlerRunConfig():
|
||||
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
|
||||
def load(data: dict) -> "CrawlerRunConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
return CrawlerRunConfig.from_kwargs(config)
|
||||
# If config type is alread instant of CrawleRunConfig, return it
|
||||
if isinstance(config, CrawlerRunConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return CrawlerRunConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type")
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
|
||||
Reference in New Issue
Block a user