feat(config): enhance serialization and add deep crawling exports
Improve configuration serialization with better handling of frozensets and slots. Expand deep crawling module exports and documentation. Add comprehensive API usage examples in Docker README. - Add support for frozenset serialization - Improve error handling in config loading - Export additional deep crawling components - Enhance Docker API documentation with detailed examples - Fix ContentTypeFilter initialization
This commit is contained in:
@@ -33,10 +33,35 @@ from .async_dispatcher import (
|
||||
)
|
||||
from .docker_client import Crawl4aiDockerClient
|
||||
from .hub import CrawlerHub
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
from .deep_crawling import (
|
||||
DeepCrawlStrategy,
|
||||
BFSDeepCrawlStrategy,
|
||||
FastFilterChain,
|
||||
FastContentTypeFilter,
|
||||
FastDomainFilter,
|
||||
FastURLFilter,
|
||||
FastFilterStats,
|
||||
FastKeywordRelevanceScorer,
|
||||
FastURLScorer,
|
||||
BestFirstCrawlingStrategy,
|
||||
DFSDeepCrawlStrategy,
|
||||
DeepCrawlDecorator,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FastFilterChain",
|
||||
"FastContentTypeFilter",
|
||||
"FastDomainFilter",
|
||||
"FastFilterStats",
|
||||
"FastURLFilter",
|
||||
"FastKeywordRelevanceScorer",
|
||||
"FastURLScorer",
|
||||
"DeepCrawlDecorator",
|
||||
"CrawlResult",
|
||||
"CrawlerHub",
|
||||
"CacheMode",
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import re
|
||||
from attr import has
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
@@ -17,6 +19,7 @@ from typing import Union, List
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
@@ -47,7 +50,11 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
# Handle lists, tuples, and sets
|
||||
if isinstance(obj, (list, tuple, set)):
|
||||
return [to_serializable_dict(item) for item in obj]
|
||||
|
||||
|
||||
# Handle frozensets: not matched by the (list, tuple, set) check above, so serialize them as lists here
|
||||
if isinstance(obj, frozenset):
|
||||
return [to_serializable_dict(item) for item in list(obj)]
|
||||
|
||||
# Handle dictionaries - preserve them as-is
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
# Get constructor signature
|
||||
sig = inspect.signature(obj.__class__.__init__)
|
||||
params = sig.parameters
|
||||
_type = obj.__class__.__name__
|
||||
|
||||
# Get current values
|
||||
current_values = {}
|
||||
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
if not (is_empty_value(value) and is_empty_value(param.default)):
|
||||
if value != param.default:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
|
||||
slot = f"_{name}"
|
||||
slot_value = getattr(obj, slot, None)
|
||||
if not is_empty_value(slot_value):
|
||||
current_values[name] = to_serializable_dict(slot_value)
|
||||
|
||||
|
||||
# # Then handle slots if present
|
||||
# if hasattr(obj.__class__, '__slots__'):
|
||||
# for slot in obj.__class__.__slots__:
|
||||
# # Remove leading underscore if present
|
||||
# param_name = slot[1:] if slot.startswith('_') else slot
|
||||
|
||||
# # Get the slot value if it exists
|
||||
# if hasattr(obj, slot):
|
||||
# value = getattr(obj, slot)
|
||||
# if not is_empty_value(value):
|
||||
# current_values[param_name] = to_serializable_dict(value)
|
||||
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
@@ -100,7 +126,10 @@ def from_serializable_dict(data: Any) -> Any:
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
if not hasattr(crawl4ai, data["type"]):
|
||||
return None
|
||||
else:
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
@@ -361,7 +390,14 @@ class BrowserConfig():
|
||||
def load( data: dict) -> "BrowserConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
# check if the deserialized object is an instance of BrowserConfig
|
||||
if isinstance(config, BrowserConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type for BrowserConfig")
|
||||
|
||||
|
||||
class CrawlerRunConfig():
|
||||
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
|
||||
def load(data: dict) -> "CrawlerRunConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
return CrawlerRunConfig.from_kwargs(config)
|
||||
# If the deserialized config is already an instance of CrawlerRunConfig, return it
|
||||
if isinstance(config, CrawlerRunConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return CrawlerRunConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type")
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
|
||||
@@ -3,6 +3,17 @@ from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
|
||||
from .bfs_strategy import BFSDeepCrawlStrategy
|
||||
from .bff_strategy import BestFirstCrawlingStrategy
|
||||
from .dfs_strategy import DFSDeepCrawlStrategy
|
||||
from .filters import (
|
||||
FastFilterChain,
|
||||
FastContentTypeFilter,
|
||||
FastDomainFilter,
|
||||
FastURLFilter,
|
||||
FastFilterStats,
|
||||
)
|
||||
from .scorers import (
|
||||
FastKeywordRelevanceScorer,
|
||||
FastURLScorer,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DeepCrawlDecorator",
|
||||
@@ -10,4 +21,11 @@ __all__ = [
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FastFilterChain",
|
||||
"FastContentTypeFilter",
|
||||
"FastDomainFilter",
|
||||
"FastURLFilter",
|
||||
"FastFilterStats",
|
||||
"FastKeywordRelevanceScorer",
|
||||
"FastURLScorer",
|
||||
]
|
||||
@@ -522,7 +522,7 @@ class FastContentTypeFilter(FastURLFilter):
|
||||
return path.rpartition(".")[-1].lower()
|
||||
|
||||
def __init__(
|
||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True
|
||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
|
||||
):
|
||||
super().__init__()
|
||||
# Normalize and store as frozenset for fast lookup
|
||||
|
||||
Reference in New Issue
Block a user