feat(config): enhance serialization and add deep crawling exports

Improve configuration serialization with better handling of frozensets and slots.
Expand deep crawling module exports and documentation.
Add comprehensive API usage examples in Docker README.

- Add support for frozenset serialization
- Improve error handling in config loading
- Export additional deep crawling components
- Enhance Docker API documentation with detailed examples
- Fix ContentTypeFilter initialization
This commit is contained in:
UncleCode
2025-02-13 21:45:19 +08:00
parent 43e09da694
commit 966fb47e64
6 changed files with 460 additions and 30 deletions

View File

@@ -33,10 +33,35 @@ from .async_dispatcher import (
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .deep_crawling import DeepCrawlStrategy
from .deep_crawling import (
DeepCrawlStrategy,
BFSDeepCrawlStrategy,
FastFilterChain,
FastContentTypeFilter,
FastDomainFilter,
FastURLFilter,
FastFilterStats,
FastKeywordRelevanceScorer,
FastURLScorer,
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
)
__all__ = [
"AsyncWebCrawler",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FastFilterChain",
"FastContentTypeFilter",
"FastDomainFilter",
"FastFilterStats",
"FastURLFilter",
"FastKeywordRelevanceScorer",
"FastURLScorer",
"DeepCrawlDecorator",
"CrawlResult",
"CrawlerHub",
"CacheMode",

View File

@@ -1,3 +1,5 @@
import re
from attr import has
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -17,6 +19,7 @@ from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -47,7 +50,11 @@ def to_serializable_dict(obj: Any) -> Dict:
# Handle lists, tuples, and sets
if isinstance(obj, (list, tuple, set)):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets separately: frozenset is not a subclass of set, so the check above does not match it
if isinstance(obj, frozenset):
return [to_serializable_dict(item) for item in list(obj)]
# Handle dictionaries - preserve them as-is
if isinstance(obj, dict):
return {
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
_type = obj.__class__.__name__
# Get current values
current_values = {}
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
slot = f"_{name}"
slot_value = getattr(obj, slot, None)
if not is_empty_value(slot_value):
current_values[name] = to_serializable_dict(slot_value)
# # Then handle slots if present
# if hasattr(obj.__class__, '__slots__'):
# for slot in obj.__class__.__slots__:
# # Remove leading underscore if present
# param_name = slot[1:] if slot.startswith('_') else slot
# # Get the slot value if it exists
# if hasattr(obj, slot):
# value = getattr(obj, slot)
# if not is_empty_value(value):
# current_values[param_name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
@@ -100,7 +126,10 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances
import crawl4ai
cls = getattr(crawl4ai, data["type"])
if not hasattr(crawl4ai, data["type"]):
return None
else:
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
@@ -361,7 +390,14 @@ class BrowserConfig():
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
return BrowserConfig.from_kwargs(config)
# check if the deserialized object is an instance of BrowserConfig
if isinstance(config, BrowserConfig):
return config
elif isinstance(config, dict):
return BrowserConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type for BrowserConfig")
class CrawlerRunConfig():
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
return CrawlerRunConfig.from_kwargs(config)
# If the deserialized config is already an instance of CrawlerRunConfig, return it
if isinstance(config, CrawlerRunConfig):
return config
elif isinstance(config, dict):
return CrawlerRunConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type")
def to_dict(self):
return {

View File

@@ -3,6 +3,17 @@ from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
from .bfs_strategy import BFSDeepCrawlStrategy
from .bff_strategy import BestFirstCrawlingStrategy
from .dfs_strategy import DFSDeepCrawlStrategy
from .filters import (
FastFilterChain,
FastContentTypeFilter,
FastDomainFilter,
FastURLFilter,
FastFilterStats,
)
from .scorers import (
FastKeywordRelevanceScorer,
FastURLScorer,
)
__all__ = [
"DeepCrawlDecorator",
@@ -10,4 +21,11 @@ __all__ = [
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FastFilterChain",
"FastContentTypeFilter",
"FastDomainFilter",
"FastURLFilter",
"FastFilterStats",
"FastKeywordRelevanceScorer",
"FastURLScorer",
]

View File

@@ -522,7 +522,7 @@ class FastContentTypeFilter(FastURLFilter):
return path.rpartition(".")[-1].lower()
def __init__(
self, allowed_types: Union[str, List[str]], check_extension: bool = True
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
):
super().__init__()
# Normalize and store as frozenset for fast lookup