Files
crawl4ai/tests/docker/test_serialization.py
UncleCode baee4949d3 refactor(llm): rename LlmConfig to LLMConfig for consistency
Rename LlmConfig to LLMConfig across the codebase to follow consistent naming conventions.
Update all imports and usages to use the new name.
Update documentation and examples to reflect the change.

BREAKING CHANGE: LlmConfig has been renamed to LLMConfig. Users need to update their imports and usage.
2025-03-05 14:17:04 +08:00

255 lines
8.3 KiB
Python

import inspect
from typing import Any, Dict
from enum import Enum
from crawl4ai.types import LLMConfig
def to_serializable_dict(obj: Any) -> Any:
    """
    Recursively convert an object into a serializable structure, using a
    {type, params} envelope for complex objects.

    Conversion rules, checked in this order:

    * ``None`` and ``str``/``int``/``float``/``bool`` values are returned
      unchanged.
    * ``Enum`` members become ``{"type": <class name>, "params": <value>}``.
    * Any object exposing an ``isoformat`` attribute (e.g. datetime objects)
      becomes the result of ``obj.isoformat()``.
    * Lists, tuples, sets and frozensets become a list of converted items.
    * Dictionaries become ``{"type": "dict", "value": {...}}`` with keys
      coerced to ``str`` and values converted recursively.
    * Any other object becomes ``{"type": <class name>, "params": {...}}``
      where ``params`` holds every ``__init__`` parameter whose same-named
      attribute on the object differs from the declared default.
    * If the constructor signature cannot be introspected, ``str(obj)`` is
      returned instead.

    Args:
        obj: The value to convert.

    Returns:
        ``None``, a primitive, a string, a list, or a dict as described above.
    """
    if obj is None:
        return None

    # Handle basic types
    if isinstance(obj, (str, int, float, bool)):
        return obj

    # Handle Enum
    if isinstance(obj, Enum):
        return {
            "type": obj.__class__.__name__,
            "params": obj.value,
        }

    # Handle datetime objects (duck-typed on the presence of isoformat)
    if hasattr(obj, "isoformat"):
        return obj.isoformat()

    # Handle lists, tuples, sets and frozensets. frozenset is listed
    # explicitly: otherwise it would fall through to the class-instance
    # branch and be emitted with empty params, dropping its items.
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [to_serializable_dict(item) for item in obj]

    # Handle dictionaries - wrap them so they are not mistaken for an envelope
    if isinstance(obj, dict):
        return {
            "type": "dict",  # Mark as plain dictionary
            "value": {str(k): to_serializable_dict(v) for k, v in obj.items()},
        }

    # Handle class instances: inspect the constructor signature
    try:
        init_params = inspect.signature(obj.__class__.__init__).parameters
    except (ValueError, TypeError):
        # inspect.signature raises these when no signature is available;
        # fall back to the string form rather than crashing.
        return str(obj)

    current_values = {}
    for name, param in init_params.items():
        if name == "self":
            continue
        # A missing attribute reads as the default and is therefore skipped.
        value = getattr(obj, name, param.default)
        # Treat "empty vs. empty" (e.g. None vs. []) as unchanged.
        if is_empty_value(value) and is_empty_value(param.default):
            continue
        # Only include values that differ from the declared default.
        if value != param.default:
            current_values[name] = to_serializable_dict(value)

    return {
        "type": obj.__class__.__name__,
        "params": current_values,
    }
def from_serializable_dict(data: Any) -> Any:
    """
    Recursively convert a serializable structure back to an object instance.

    Recognised shapes:

    * ``None`` and ``str``/``int``/``float``/``bool`` values are returned
      unchanged.
    * Lists are rebuilt element by element.
    * ``{"type": "dict", "value": {...}}`` is unwrapped into a plain dict.
    * ``{"type": <name>, "params": ...}`` is resolved by looking ``<name>``
      up on the ``crawl4ai`` package. Enum classes are rebuilt with
      ``cls(params)``; other classes with ``cls(**params)`` after converting
      each parameter recursively.
    * Any other dict (legacy raw dictionaries, including ones that merely
      contain a ``"type"`` key of their own) is rebuilt key by key.

    Args:
        data: The serialized value.

    Returns:
        The reconstructed object, container or primitive.

    Raises:
        AttributeError: If a ``{"type", "params"}`` envelope names something
            that ``crawl4ai`` does not export.
    """
    if data is None:
        return None

    # Handle basic types
    if isinstance(data, (str, int, float, bool)):
        return data

    # Handle lists
    if isinstance(data, list):
        return [from_serializable_dict(item) for item in data]

    if isinstance(data, dict):
        type_name = data.get("type")

        # Handle wrapped plain dictionaries
        if type_name == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

        # Handle typed envelopes. Requiring "params" alongside a string
        # "type" keeps ordinary dicts that happen to carry a "type" key
        # (e.g. {"name": "title", "type": "text"}) on the legacy path below.
        if isinstance(type_name, str) and "params" in data:
            # Import from crawl4ai for class instances
            import crawl4ai

            cls = getattr(crawl4ai, type_name)

            # Handle Enum
            if issubclass(cls, Enum):
                return cls(data["params"])

            # Handle class instances
            constructor_args = {
                k: from_serializable_dict(v) for k, v in data["params"].items()
            }
            return cls(**constructor_args)

        # Handle raw dictionaries (legacy support)
        return {k: from_serializable_dict(v) for k, v in data.items()}

    return data
def is_empty_value(value: Any) -> bool:
    """Return True when *value* is None or a zero-length list, tuple, set, dict or str."""
    if value is None:
        return True
    sized_builtins = (list, tuple, set, dict, str)
    # Anything that is not one of the container types above counts as
    # non-empty, including falsy scalars such as 0 and False.
    return isinstance(value, sized_builtins) and len(value) == 0
# if __name__ == "__main__":
# from crawl4ai import (
# CrawlerRunConfig, CacheMode, DefaultMarkdownGenerator,
# PruningContentFilter, BM25ContentFilter, LLMContentFilter,
# JsonCssExtractionStrategy, CosineStrategy, RegexChunking,
# WebScrapingStrategy, LXMLWebScrapingStrategy
# )
# # Test Case 1: BM25 content filtering through markdown generator
# config1 = CrawlerRunConfig(
# cache_mode=CacheMode.BYPASS,
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(
# user_query="technology articles",
# bm25_threshold=1.2,
# language="english"
# )
# ),
# chunking_strategy=RegexChunking(patterns=[r"\n\n", r"\.\s+"]),
# excluded_tags=["nav", "footer", "aside"],
# remove_overlay_elements=True
# )
# # Serialize
# serialized = to_serializable_dict(config1)
# print("\nSerialized Config:")
# print(serialized)
# # Example output structure would now look like:
# """
# {
# "type": "CrawlerRunConfig",
# "params": {
# "cache_mode": {
# "type": "CacheMode",
# "params": "bypass"
# },
# "markdown_generator": {
# "type": "DefaultMarkdownGenerator",
# "params": {
# "content_filter": {
# "type": "BM25ContentFilter",
# "params": {
# "user_query": "technology articles",
# "bm25_threshold": 1.2,
# "language": "english"
# }
# }
# }
# }
# }
# }
# """
# # Deserialize
# deserialized = from_serializable_dict(serialized)
# print("\nDeserialized Config:")
# print(to_serializable_dict(deserialized))
# # Verify they match
# assert to_serializable_dict(config1) == to_serializable_dict(deserialized)
# print("\nVerification passed: Configuration matches after serialization/deserialization!")
if __name__ == "__main__":
    from crawl4ai import (
        CrawlerRunConfig,
        CacheMode,
        DefaultMarkdownGenerator,
        PruningContentFilter,
        BM25ContentFilter,
        LLMContentFilter,
        JsonCssExtractionStrategy,
        RegexChunking,
        WebScrapingStrategy,
        LXMLWebScrapingStrategy,
    )

    # Test Case 1: BM25 content filtering through the markdown generator
    bm25_filter = BM25ContentFilter(
        user_query="technology articles",
        bm25_threshold=1.2,
        language="english",
    )
    bm25_markdown = DefaultMarkdownGenerator(content_filter=bm25_filter)
    regex_chunker = RegexChunking(patterns=[r"\n\n", r"\.\s+"])
    config1 = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=bm25_markdown,
        chunking_strategy=regex_chunker,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
    )

    # Test Case 2: JSON/CSS schema extraction with a pruning content filter
    schema = {
        "baseSelector": "article.post",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".content", "type": "html"},
        ],
    }
    css_extraction = JsonCssExtractionStrategy(schema=schema)
    pruning_filter = PruningContentFilter(
        threshold=0.48,
        threshold_type="fixed",
        min_word_threshold=0,
    )
    pruning_markdown = DefaultMarkdownGenerator(
        content_filter=pruning_filter,
        options={"ignore_links": True},
    )
    config2 = CrawlerRunConfig(
        extraction_strategy=css_extraction,
        markdown_generator=pruning_markdown,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    # Test Case 3: LLM content filter
    llm_filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4"),
        instruction="Extract key technical concepts",
        chunk_token_threshold=2000,
        overlap_rate=0.1,
    )
    llm_markdown = DefaultMarkdownGenerator(
        content_filter=llm_filter,
        options={"ignore_images": True},
    )
    config3 = CrawlerRunConfig(
        markdown_generator=llm_markdown,
        scraping_strategy=WebScrapingStrategy(),
    )

    # Round-trip every configuration: serialize, rebuild, serialize again,
    # and require both serialized forms to be equal.
    test_configs = [config1, config2, config3]
    for i, config in enumerate(test_configs, 1):
        print(f"\nTesting Configuration {i}:")

        # Serialize
        serialized = to_serializable_dict(config)
        print(f"\nSerialized Config {i}:")
        print(serialized)

        # Deserialize, then show the rebuilt object in serialized form
        restored = from_serializable_dict(serialized)
        print(f"\nDeserialized Config {i}:")
        print(to_serializable_dict(restored))

        # Verify they match
        expected = to_serializable_dict(config)
        actual = to_serializable_dict(restored)
        assert expected == actual
        print(f"\nVerification passed: Configuration {i} matches after serialization/deserialization!")