feat(config): enhance serialization and add deep crawling exports

Improve configuration serialization with better handling of frozensets and slots.
Expand deep crawling module exports and documentation.
Add comprehensive API usage examples in Docker README.

- Add support for frozenset serialization
- Improve error handling in config loading
- Export additional deep crawling components
- Enhance Docker API documentation with detailed examples
- Fix ContentTypeFilter initialization
This commit is contained in:
UncleCode
2025-02-13 21:45:19 +08:00
parent 43e09da694
commit 966fb47e64
6 changed files with 460 additions and 30 deletions

View File

@@ -1,3 +1,5 @@
import re
from attr import has
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -17,6 +19,7 @@ from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -47,7 +50,11 @@ def to_serializable_dict(obj: Any) -> Dict:
# Handle lists, tuples, and sets
if isinstance(obj, (list, tuple, set)):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets, which are not iterable
if isinstance(obj, frozenset):
return [to_serializable_dict(item) for item in list(obj)]
# Handle dictionaries - preserve them as-is
if isinstance(obj, dict):
return {
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
_type = obj.__class__.__name__
# Get current values
current_values = {}
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
slot = f"_{name}"
slot_value = getattr(obj, slot, None)
if not is_empty_value(slot_value):
current_values[name] = to_serializable_dict(slot_value)
# # Then handle slots if present
# if hasattr(obj.__class__, '__slots__'):
# for slot in obj.__class__.__slots__:
# # Remove leading underscore if present
# param_name = slot[1:] if slot.startswith('_') else slot
# # Get the slot value if it exists
# if hasattr(obj, slot):
# value = getattr(obj, slot)
# if not is_empty_value(value):
# current_values[param_name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
@@ -100,7 +126,10 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances
import crawl4ai
cls = getattr(crawl4ai, data["type"])
if not hasattr(crawl4ai, data["type"]):
return None
else:
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
@@ -361,7 +390,14 @@ class BrowserConfig():
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
return BrowserConfig.from_kwargs(config)
# check if the deserialized object is an instance of BrowserConfig
if isinstance(config, BrowserConfig):
return config
elif isinstance(config, dict):
return BrowserConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type for BrowserConfig")
class CrawlerRunConfig():
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
return CrawlerRunConfig.from_kwargs(config)
# If config type is alread instant of CrawleRunConfig, return it
if isinstance(config, CrawlerRunConfig):
return config
elif isinstance(config, dict):
return CrawlerRunConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type")
def to_dict(self):
return {