fix(docker): resolve filter serialization and JSON encoding errors in deep crawl strategy (ref #1419)
- Fix URLPatternFilter serialization by preventing private __slots__ from being serialized as constructor params - Add public attributes to URLPatternFilter to store original constructor parameters for proper serialization - Handle property descriptors in CrawlResult.model_dump() to prevent JSON serialization errors - Ensure filter chains work correctly with Docker client and REST API The issue occurred because: 1. Private implementation details (_simple_suffixes, etc.) were being serialized and passed as constructor arguments during deserialization 2. Property descriptors were being included in the serialized output, causing "Object of type property is not JSON serializable" errors Changes: - async_configs.py: Comment out __slots__ serialization logic (lines 100-109) - filters.py: Add patterns, use_glob, reverse to URLPatternFilter __slots__ and store as public attributes - models.py: Convert property descriptors to strings in model_dump() instead of including them directly
This commit is contained in:
@@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
||||
if value != param.default and not ignore_default_value:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
|
||||
if hasattr(obj, '__slots__'):
|
||||
for slot in obj.__slots__:
|
||||
if slot.startswith('_'): # Handle private slots
|
||||
attr_name = slot[1:] # Remove leading '_'
|
||||
value = getattr(obj, slot, None)
|
||||
if value is not None:
|
||||
current_values[attr_name] = to_serializable_dict(value)
|
||||
# Don't serialize private __slots__ - they're internal implementation details
|
||||
# not constructor parameters. This was causing URLPatternFilter to fail
|
||||
# because _simple_suffixes was being serialized as 'simple_suffixes'
|
||||
# if hasattr(obj, '__slots__'):
|
||||
# for slot in obj.__slots__:
|
||||
# if slot.startswith('_'): # Handle private slots
|
||||
# attr_name = slot[1:] # Remove leading '_'
|
||||
# value = getattr(obj, slot, None)
|
||||
# if value is not None:
|
||||
# current_values[attr_name] = to_serializable_dict(value)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
|
||||
"""Pattern filter balancing speed and completeness"""
|
||||
|
||||
__slots__ = (
|
||||
"patterns", # Store original patterns for serialization
|
||||
"use_glob", # Store original use_glob for serialization
|
||||
"reverse", # Store original reverse for serialization
|
||||
"_simple_suffixes",
|
||||
"_simple_prefixes",
|
||||
"_domain_patterns",
|
||||
@@ -142,6 +145,11 @@ class URLPatternFilter(URLFilter):
|
||||
reverse: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
# Store original constructor params for serialization
|
||||
self.patterns = patterns
|
||||
self.use_glob = use_glob
|
||||
self.reverse = reverse
|
||||
|
||||
self._reverse = reverse
|
||||
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
||||
|
||||
|
||||
@@ -253,6 +253,16 @@ class CrawlResult(BaseModel):
|
||||
requirements change, this is where you would update the logic.
|
||||
"""
|
||||
result = super().model_dump(*args, **kwargs)
|
||||
|
||||
# Remove any property descriptors that might have been included
|
||||
# These deprecated properties should not be in the serialized output
|
||||
for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
|
||||
if key in result and isinstance(result[key], property):
|
||||
# del result[key]
|
||||
# Nasrin: I decided to convert it to string instead of removing it.
|
||||
result[key] = str(result[key])
|
||||
|
||||
# Add the markdown field properly
|
||||
if self._markdown is not None:
|
||||
result["markdown"] = self._markdown.model_dump()
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user