Merge pull request #1648 from christopher-w-murphy/fix/content-relevance-filter

[Fix]: Docker server does not decode ContentRelevanceFilter
2025-12-03 18:36:07 +08:00
parent df4d87ed78 6893094f58
commit 5a8fb57795
7 changed files with 169 additions and 53 deletions
--- a/README.md
+++ b/README.md
@@ -1034,7 +1034,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro

 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
-| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a>  | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver |
+| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a>  | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
 | <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
 | <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -72,6 +72,8 @@ from .deep_crawling import (
    BestFirstCrawlingStrategy,
    DFSDeepCrawlStrategy,
    DeepCrawlDecorator,
+    ContentRelevanceFilter,
+    ContentTypeScorer,
 )
 # NEW: Import AsyncUrlSeeder
 from .async_url_seeder import AsyncUrlSeeder
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,5 +1,5 @@
+import importlib
 import os
-from typing import Union
 import warnings
 import requests
 from .config import (
@@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy

-from typing import Union, List, Callable
 import inspect
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 from enum import Enum

 # Type alias for URL matching
 UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]

+
 class MatchMode(Enum):
    OR = "or"
    AND = "and"
@@ -42,8 +42,7 @@ class MatchMode(Enum):
 # from .proxy_strategy import ProxyConfig


-
-def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
    """
    Recursively convert an object to a serializable dictionary using {type, params} structure
    for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
        #             if value is not None:
        #                 current_values[attr_name] = to_serializable_dict(value)

-            
-        
        return {
            "type": obj.__class__.__name__,
            "params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
        if data["type"] == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

-        # Import from crawl4ai for class instances
-        import crawl4ai
-
-        if hasattr(crawl4ai, data["type"]):
-            cls = getattr(crawl4ai, data["type"])
+        cls = None
+        # If you are receiving an error while trying to convert a dict to an object:
+        # Either add a module to the `module_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
+        module_paths = ["crawl4ai"]
+        for module_path in module_paths:
+            try:
+                mod = importlib.import_module(module_path)
+                if hasattr(mod, data["type"]):
+                    cls = getattr(mod, data["type"])
+                    break
+            except (ImportError, AttributeError):
+                continue

+        if cls is not None:
            # Handle Enum
            if issubclass(cls, Enum):
                return cls(data["params"])
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
 class ContentRelevanceFilter(URLFilter):
    """BM25-based relevance filter using head section content"""

-    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")

    def __init__(
        self,
-        query: str,
+        query: Union[str, List[str]],
        threshold: float,
        k1: float = 1.2,
        b: float = 0.75,
        avgdl: int = 1000,
    ):
        super().__init__(name="BM25RelevanceFilter")
-        self.query_terms = self._tokenize(query)
+        if isinstance(query, list):
+            self.query = " ".join(query)
+        else:
+            self.query = query
+        self.query_terms = self._tokenize(self.query)
        self.threshold = threshold
        self.k1 = k1  # TF saturation parameter
        self.b = b  # Length normalization parameter
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -180,7 +180,7 @@ class Crawl4aiDockerClient:
                                yield CrawlResult(**result)
            return stream_results()

-        response = await self._request("POST", "/crawl", json=data)
+        response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
        result_data = response.json()
        if not result_data.get("success", False):
            raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
--- a/docs/examples/cloud_browser/scrapeless_browser.py
+++ b/docs/examples/cloud_browser/scrapeless_browser.py
@@ -0,0 +1,61 @@
+import json
+import asyncio
+from urllib.parse import quote, urlencode
+from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
+
+# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
+# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
+
+async def main():
+    # customize browser fingerprint
+    fingerprint = {
+        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
+        "platform": "Windows",
+        "screen": {
+            "width": 1280, "height": 1024
+        },
+        "localization": {
+            "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong",
+        }
+    }
+
+    fingerprint_json = json.dumps(fingerprint)
+    encoded_fingerprint = quote(fingerprint_json)
+
+    scrapeless_params = {
+        "token": "your token",
+        "sessionTTL": 1000,
+        "sessionName": "Demo",
+        "fingerprint": encoded_fingerprint,
+        # Sets the target country/region for the proxy, sending requests via an IP address from that region. You can specify a country code (e.g., US for the United States, GB for the United Kingdom, ANY for any country). See country codes for all supported options.
+        # "proxyCountry": "ANY",
+        # create profile on scrapeless
+        # "profileId": "your profileId",
+        # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
+    }
+    query_string = urlencode(scrapeless_params)
+    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"
+    async with AsyncWebCrawler(
+        config=BrowserConfig(
+            headless=False,
+            browser_mode="cdp",
+            cdp_url=scrapeless_connection_url,
+        )
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.scrapeless.com/en",
+            config=CrawlerRunConfig(
+                wait_for="css:.content",
+                scan_full_page=True,
+            ),
+        )
+        print("-" * 20)
+        print(f'Status Code: {result.status_code}')
+        print("-" * 20)
+        print(f'Title: {result.metadata["title"]}')
+        print(f'Description: {result.metadata["description"]}')
+        print("-" * 20)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    
--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -1,16 +1,31 @@
 """
 Test the complete fix for both the filter serialization and JSON serialization issues.
 """
+import os
+import traceback
+from typing import Any

 import asyncio
 import httpx

 from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
+from crawl4ai.deep_crawling import (
+    BFSDeepCrawlStrategy,
+    ContentRelevanceFilter,
+    FilterChain,
+    URLFilter,
+    URLPatternFilter,
+)

-BASE_URL = "http://localhost:11234/"  # Adjust port as needed
+CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
+try:
+    BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
+except (TypeError, ValueError):  # int() raises ValueError on a non-numeric string
+    BASE_PORT = 11234
+BASE_URL = f"http://localhost:{BASE_PORT}/"  # Set CRAWL4AI_DOCKER_PORT to override the port

-async def test_with_docker_client():
+
+async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
    """Test using the Docker client (same as 1419.py)."""
    from crawl4ai.docker_client import Crawl4aiDockerClient
    
@@ -24,19 +39,10 @@ async def test_with_docker_client():
            verbose=True,
        ) as client:
            
-            # Create filter chain - testing the serialization fix
-            filter_chain = [
-                URLPatternFilter(
-                    # patterns=["*about*", "*privacy*", "*terms*"],
-                    patterns=["*advanced*"],
-                    reverse=True
-                ),
-            ]
-            
            crawler_config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=2,  # Keep it shallow for testing
-                    # max_pages=5,  # Limit pages for testing
+                    max_pages=max_pages,  # Limit pages for testing
                    filter_chain=FilterChain(filter_chain)
                ),
                cache_mode=CacheMode.BYPASS,
@@ -47,6 +53,7 @@ async def test_with_docker_client():
                ["https://docs.crawl4ai.com"],  # Simple test page
                browser_config=BrowserConfig(headless=True),
                crawler_config=crawler_config,
+                hooks_timeout=timeout,
            )
            
            if results:
@@ -74,12 +81,11 @@ async def test_with_docker_client():
        
    except Exception as e:
        print(f"❌ Docker client test failed: {e}")
-        import traceback
        traceback.print_exc()
        return False


-async def test_with_rest_api():
+async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
    """Test using REST API directly."""
    print("\n" + "=" * 60)
    print("Testing with REST API")
@@ -90,19 +96,11 @@ async def test_with_rest_api():
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": 2,
-            # "max_pages": 5,
+            "max_pages": max_pages,
            "filter_chain": {
                "type": "FilterChain",
                "params": {
-                    "filters": [
-                        {
-                            "type": "URLPatternFilter",
-                            "params": {
-                                "patterns": ["*advanced*"],
-                                "reverse": True
-                            }
-                        }
-                    ]
+                    "filters": filters
                }
            }
        }
@@ -126,7 +124,7 @@ async def test_with_rest_api():
            response = await client.post(
                f"{BASE_URL}crawl",
                json=crawl_payload,
-                timeout=30
+                timeout=timeout,
            )
            
            if response.status_code == 200:
@@ -150,7 +148,6 @@ async def test_with_rest_api():
        
    except Exception as e:
        print(f"❌ REST API test failed: {e}")
-        import traceback
        traceback.print_exc()
        return False

@@ -165,12 +162,62 @@ async def main():
    results = []
    
    # Test 1: Docker client
-    docker_passed = await test_with_docker_client()
-    results.append(("Docker Client", docker_passed))
+    max_pages_ = [20, 5]
+    timeouts = [30, 60]
+    filter_chain_test_cases = [
+        [
+            URLPatternFilter(
+                # patterns=["*about*", "*privacy*", "*terms*"],
+                patterns=["*advanced*"],
+                reverse=True
+            ),
+        ],
+        [
+            ContentRelevanceFilter(
+                query="about faq",
+                threshold=0.2,
+            ),
+        ],
+    ]
+    for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
+        docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
+        results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
    
    # Test 2: REST API
-    rest_passed = await test_with_rest_api()
-    results.append(("REST API", rest_passed))
+    max_pages_ = [20, 5, 5]
+    timeouts = [30, 60, 60]
+    filters_test_cases = [
+        [
+            {
+                "type": "URLPatternFilter",
+                "params": {
+                    "patterns": ["*advanced*"],
+                    "reverse": True
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": "about faq",
+                    "threshold": 0.2,
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": ["about", "faq"],
+                    "threshold": 0.2,
+                }
+            }
+        ],
+    ]
+    for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
+        rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
+        results.append((f"REST API w/ filters {idx}", rest_passed))
    
    # Summary
    print("\n" + "=" * 60)
@@ -186,10 +233,7 @@ async def main():
    
    print("=" * 60)
    if all_passed:
-        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
-        print("\nThe fixes:")
-        print("1. Filter serialization: Fixed by not serializing private __slots__")
-        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
+        print("🎉 ALL TESTS PASSED!")
    else:
        print("⚠️ Some tests failed. Please check the server logs for details.")
    
@@ -198,4 +242,4 @@ async def main():

 if __name__ == "__main__":
    import sys
-    sys.exit(asyncio.run(main()))
+    sys.exit(asyncio.run(main()))