Merge pull request #1648 from christopher-w-murphy/fix/content-relevance-filter

[Fix]: Docker server does not decode ContentRelevanceFilter
2025-12-03 18:36:07 +08:00
parent df4d87ed78 6893094f58
commit 5a8fb57795
7 changed files with 169 additions and 53 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -72,6 +72,8 @@ from .deep_crawling import (
    BestFirstCrawlingStrategy,
    DFSDeepCrawlStrategy,
    DeepCrawlDecorator,
+    ContentRelevanceFilter,
+    ContentTypeScorer,
 )
 # NEW: Import AsyncUrlSeeder
 from .async_url_seeder import AsyncUrlSeeder
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,5 +1,5 @@
+import importlib
 import os
-from typing import Union
 import warnings
 import requests
 from .config import (
@@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy

-from typing import Union, List, Callable
 import inspect
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 from enum import Enum

 # Type alias for URL matching
 UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]

+
 class MatchMode(Enum):
    OR = "or"
    AND = "and"
@@ -42,8 +42,7 @@ class MatchMode(Enum):
 # from .proxy_strategy import ProxyConfig


-
-def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
    """
    Recursively convert an object to a serializable dictionary using {type, params} structure
    for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
        #             if value is not None:
        #                 current_values[attr_name] = to_serializable_dict(value)

-            
-        
        return {
            "type": obj.__class__.__name__,
            "params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
        if data["type"] == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

-        # Import from crawl4ai for class instances
-        import crawl4ai
-
-        if hasattr(crawl4ai, data["type"]):
-            cls = getattr(crawl4ai, data["type"])
+        cls = None
+        # If you are receiving an error while trying to convert a dict to an object:
+        # Either add a module to the `module_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
+        module_paths = ["crawl4ai"]
+        for module_path in module_paths:
+            try:
+                mod = importlib.import_module(module_path)
+                if hasattr(mod, data["type"]):
+                    cls = getattr(mod, data["type"])
+                    break
+            except (ImportError, AttributeError):
+                continue

+        if cls is not None:
            # Handle Enum
            if issubclass(cls, Enum):
                return cls(data["params"])
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
 class ContentRelevanceFilter(URLFilter):
    """BM25-based relevance filter using head section content"""

-    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")

    def __init__(
        self,
-        query: str,
+        query: Union[str, List[str]],
        threshold: float,
        k1: float = 1.2,
        b: float = 0.75,
        avgdl: int = 1000,
    ):
        super().__init__(name="BM25RelevanceFilter")
-        self.query_terms = self._tokenize(query)
+        if isinstance(query, list):
+            self.query = " ".join(query)
+        else:
+            self.query = query
+        self.query_terms = self._tokenize(self.query)
        self.threshold = threshold
        self.k1 = k1  # TF saturation parameter
        self.b = b  # Length normalization parameter
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -180,7 +180,7 @@ class Crawl4aiDockerClient:
                                yield CrawlResult(**result)
            return stream_results()

-        response = await self._request("POST", "/crawl", json=data)
+        response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
        result_data = response.json()
        if not result_data.get("success", False):
            raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")