From 0024c82cdcbd7c2d9e9e93ec40c8fec2563eff8f Mon Sep 17 00:00:00 2001
From: Aravind <aravind.karanam@gmail.com>
Date: Mon, 24 Nov 2025 17:59:33 +0530
Subject: [PATCH 1/7] Sponsors/new (#1637)

---
 README.md                                     |  2 +-
 .../cloud_browser/scrapeless_browser.py       | 61 +++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 docs/examples/cloud_browser/scrapeless_browser.py
diff --git a/README.md b/README.md
index 79161a8a..09178cb9 100644
--- a/README.md
+++ b/README.md
@@ -1034,7 +1034,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
-| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a>  | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver |
+| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a>  | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
 | <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
 | <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
diff --git a/docs/examples/cloud_browser/scrapeless_browser.py b/docs/examples/cloud_browser/scrapeless_browser.py
new file mode 100644
index 00000000..4981c813
--- /dev/null
+++ b/docs/examples/cloud_browser/scrapeless_browser.py
@@ -0,0 +1,61 @@
+import json
+import asyncio
+from urllib.parse import quote, urlencode
+from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
+
+# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
+# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
+
+async def main():
+    """Crawl https://www.scrapeless.com/en via a Scrapeless cloud browser over CDP and print status, title, description."""
+    # Custom browser fingerprint; userAgent and platform are kept describing the same OS.
+    fingerprint = {
+        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
+        "platform": "Windows",
+        "screen": {"width": 1280, "height": 1024},
+        "localization": {
+            "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong",
+        }
+    }
+
+    fingerprint_json = json.dumps(fingerprint)
+    encoded_fingerprint = quote(fingerprint_json)
+
+    scrapeless_params = {
+        "token": "your token",
+        "sessionTTL": 1000,
+        "sessionName": "Demo",
+        "fingerprint": encoded_fingerprint,
+        # Sets the target country/region for the proxy, sending requests via an IP address from that region. You can specify a country code (e.g., US for the United States, GB for the United Kingdom, ANY for any country). See country codes for all supported options.
+        # "proxyCountry": "ANY",
+        # create profile on scrapeless
+        # "profileId": "your profileId",
+        # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
+    }
+    query_string = urlencode(scrapeless_params)
+    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"
+    async with AsyncWebCrawler(
+        config=BrowserConfig(
+            headless=False,
+            browser_mode="cdp",
+            cdp_url=scrapeless_connection_url,
+        )
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.scrapeless.com/en",
+            config=CrawlerRunConfig(
+                wait_for="css:.content",
+                scan_full_page=True,
+            ),
+        )
+        metadata = result.metadata or {}  # tolerate absent metadata / missing keys instead of raising
+        print("-" * 20)
+        print(f'Status Code: {result.status_code}')
+        print("-" * 20)
+        print(f'Title: {metadata.get("title")}')
+        print(f'Description: {metadata.get("description")}')
+        print("-" * 20)
+
+if __name__ == "__main__":  # run the demo only when executed as a script
+    asyncio.run(main())  # drive the async entry point to completion
+    
\ No newline at end of file

From 33a3cc3933c551d93098a4e4caab565cc78bc511 Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 11:31:07 -0500
Subject: [PATCH 2/7] reproduced AttributeError from #1642

---
 tests/docker/test_filter_deep_crawl.py | 102 +++++++++++++++++--------
 1 file changed, 71 insertions(+), 31 deletions(-)

diff --git a/tests/docker/test_filter_deep_crawl.py b/tests/docker/test_filter_deep_crawl.py
index 4ee0df40..25feacd9 100644
--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -1,16 +1,30 @@
 """
 Test the complete fix for both the filter serialization and JSON serialization issues.
 """
+import os
+from typing import Any
 
 import asyncio
 import httpx
 
 from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
+from crawl4ai.deep_crawling import (
+    BFSDeepCrawlStrategy,
+    ContentRelevanceFilter,
+    FilterChain,
+    URLFilter,
+    URLPatternFilter,
+)
 
-BASE_URL = "http://localhost:11234/"  # Adjust port as needed
+CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
+try:
+    BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
+except (TypeError, ValueError):  # int() raises ValueError for non-numeric text; fall back to default
+    BASE_PORT = 11234
+BASE_URL = f"http://localhost:{BASE_PORT}/"  # Adjust port as needed
 
-async def test_with_docker_client():
+
+async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool:
     """Test using the Docker client (same as 1419.py)."""
     from crawl4ai.docker_client import Crawl4aiDockerClient
     
@@ -24,15 +38,6 @@ async def test_with_docker_client():
             verbose=True,
         ) as client:
             
-            # Create filter chain - testing the serialization fix
-            filter_chain = [
-                URLPatternFilter(
-                    # patterns=["*about*", "*privacy*", "*terms*"],
-                    patterns=["*advanced*"],
-                    reverse=True
-                ),
-            ]
-            
             crawler_config = CrawlerRunConfig(
                 deep_crawl_strategy=BFSDeepCrawlStrategy(
                     max_depth=2,  # Keep it shallow for testing
@@ -79,7 +84,7 @@ async def test_with_docker_client():
         return False
 
 
-async def test_with_rest_api():
+async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool:
     """Test using REST API directly."""
     print("\n" + "=" * 60)
     print("Testing with REST API")
@@ -94,15 +99,7 @@ async def test_with_rest_api():
             "filter_chain": {
                 "type": "FilterChain",
                 "params": {
-                    "filters": [
-                        {
-                            "type": "URLPatternFilter",
-                            "params": {
-                                "patterns": ["*advanced*"],
-                                "reverse": True
-                            }
-                        }
-                    ]
+                    "filters": filters
                 }
             }
         }
@@ -165,12 +162,58 @@ async def main():
     results = []
     
     # Test 1: Docker client
-    docker_passed = await test_with_docker_client()
-    results.append(("Docker Client", docker_passed))
+    filter_chain_test_cases = [
+        [
+            URLPatternFilter(
+                # patterns=["*about*", "*privacy*", "*terms*"],
+                patterns=["*advanced*"],
+                reverse=True
+            ),
+        ],
+        [
+            ContentRelevanceFilter(
+                query="about faq",
+                threshold=0.2,
+            ),
+        ],
+    ]
+    for idx, filter_chain in enumerate(filter_chain_test_cases):
+        docker_passed = await test_with_docker_client(filter_chain=filter_chain)
+        results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
     
     # Test 2: REST API
-    rest_passed = await test_with_rest_api()
-    results.append(("REST API", rest_passed))
+    filters_test_cases = [
+        [
+            {
+                "type": "URLPatternFilter",
+                "params": {
+                    "patterns": ["*advanced*"],
+                    "reverse": True
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": "about faq",
+                    "threshold": 0.2,
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": ["about", "faq"],
+                    "threshold": 0.2,
+                }
+            }
+        ],
+    ]
+    for idx, filters in enumerate(filters_test_cases):
+        rest_passed = await test_with_rest_api(filters=filters)
+        results.append((f"REST API w/ filters {idx}", rest_passed))
     
     # Summary
     print("\n" + "=" * 60)
@@ -186,10 +229,7 @@ async def main():
     
     print("=" * 60)
     if all_passed:
-        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
-        print("\nThe fixes:")
-        print("1. Filter serialization: Fixed by not serializing private __slots__")
-        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
+        print("🎉 ALL TESTS PASSED!")
     else:
         print("⚠️ Some tests failed. Please check the server logs for details.")
     
@@ -198,4 +238,4 @@ async def main():
 
 if __name__ == "__main__":
     import sys
-    sys.exit(asyncio.run(main()))
\ No newline at end of file
+    sys.exit(asyncio.run(main()))

From 6ec6bc4d8aee72484f7ae567aba74ec3da9f5753 Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 16:15:27 -0500
Subject: [PATCH 3/7] pass timeout parameter to docker client request

---
 crawl4ai/docker_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py
index 969fee7c..6624cf07 100644
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -180,7 +180,7 @@ class Crawl4aiDockerClient:
                                 yield CrawlResult(**result)
             return stream_results()
 
-        response = await self._request("POST", "/crawl", json=data)
+        response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
         result_data = response.json()
         if not result_data.get("success", False):
             raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")

From eb76df2c0d8a5d77667fcaaa413655639e9208ca Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 16:15:58 -0500
Subject: [PATCH 4/7] added missing deep crawling objects to init

---
 crawl4ai/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index 8f1fdef4..af35e6a0 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -72,6 +72,8 @@ from .deep_crawling import (
     BestFirstCrawlingStrategy,
     DFSDeepCrawlStrategy,
     DeepCrawlDecorator,
+    ContentRelevanceFilter,
+    ContentTypeScorer,
 )
 # NEW: Import AsyncUrlSeeder
 from .async_url_seeder import AsyncUrlSeeder

From e95e8e1a974ebee12ba42a21387fd7ecc7d8fec9 Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 16:16:31 -0500
Subject: [PATCH 5/7] generalized query in ContentRelevanceFilter to be a str
 or list

---
 crawl4ai/deep_crawling/filters.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 981cbcd8..c075cb7d 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
 class ContentRelevanceFilter(URLFilter):
     """BM25-based relevance filter using head section content"""
 
-    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")
 
     def __init__(
         self,
-        query: str,
+        query: Union[str, List[str]],
         threshold: float,
         k1: float = 1.2,
         b: float = 0.75,
         avgdl: int = 1000,
     ):
         super().__init__(name="BM25RelevanceFilter")
-        self.query_terms = self._tokenize(query)
+        if isinstance(query, list):
+            self.query = " ".join(query)
+        else:
+            self.query = query
+        self.query_terms = self._tokenize(self.query)
         self.threshold = threshold
         self.k1 = k1  # TF saturation parameter
         self.b = b  # Length normalization parameter

From 3a8f8298d3357049251e611ccdbba233d56f2e61 Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 16:18:59 -0500
Subject: [PATCH 6/7] import modules from enhanceable deserialization

---
 crawl4ai/async_configs.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index bfa0d398..eee43547 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,5 +1,5 @@
+import importlib
 import os
-from typing import Union
 import warnings
 import requests
 from .config import (
@@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
 
-from typing import Union, List, Callable
 import inspect
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 from enum import Enum
 
 # Type alias for URL matching
 UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
 
+
 class MatchMode(Enum):
     OR = "or"
     AND = "and"
@@ -42,8 +42,7 @@ class MatchMode(Enum):
 # from .proxy_strategy import ProxyConfig
 
 
-
-def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
     """
     Recursively convert an object to a serializable dictionary using {type, params} structure
     for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
         #             if value is not None:
         #                 current_values[attr_name] = to_serializable_dict(value)
 
-            
-        
         return {
             "type": obj.__class__.__name__,
             "params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
         if data["type"] == "dict" and "value" in data:
             return {k: from_serializable_dict(v) for k, v in data["value"].items()}
 
-        # Import from crawl4ai for class instances
-        import crawl4ai
-
-        if hasattr(crawl4ai, data["type"]):
-            cls = getattr(crawl4ai, data["type"])
+        cls = None
+        # If you are receiving an error while trying to convert a dict to an object:
+        # Either add a module to the `module_paths` list, or export the `data["type"]` class from the crawl4ai __init__.py file
+        module_paths = ["crawl4ai"]
+        for module_path in module_paths:
+            try:
+                mod = importlib.import_module(module_path)
+                if hasattr(mod, data["type"]):
+                    cls = getattr(mod, data["type"])
+                    break
+            except (ImportError, AttributeError):
+                continue
 
+        if cls is not None:
             # Handle Enum
             if issubclass(cls, Enum):
                 return cls(data["params"])

From 6893094f58582e7787888d65582c8d5767b14645 Mon Sep 17 00:00:00 2001
From: Chris Murphy <chris.murphy@klaviyo.com>
Date: Mon, 1 Dec 2025 16:19:19 -0500
Subject: [PATCH 7/7] parameterized tests

---
 tests/docker/test_filter_deep_crawl.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/docker/test_filter_deep_crawl.py b/tests/docker/test_filter_deep_crawl.py
index 25feacd9..9e82073c 100644
--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -2,6 +2,7 @@
 Test the complete fix for both the filter serialization and JSON serialization issues.
 """
 import os
+import traceback
 from typing import Any
 
 import asyncio
@@ -24,7 +25,7 @@ except TypeError:
 BASE_URL = f"http://localhost:{BASE_PORT}/"  # Adjust port as needed
 
 
-async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool:
+async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
     """Test using the Docker client (same as 1419.py)."""
     from crawl4ai.docker_client import Crawl4aiDockerClient
     
@@ -41,7 +42,7 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool:
             crawler_config = CrawlerRunConfig(
                 deep_crawl_strategy=BFSDeepCrawlStrategy(
                     max_depth=2,  # Keep it shallow for testing
-                    # max_pages=5,  # Limit pages for testing
+                    max_pages=max_pages,  # Limit pages for testing
                     filter_chain=FilterChain(filter_chain)
                 ),
                 cache_mode=CacheMode.BYPASS,
@@ -52,6 +53,7 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool:
                 ["https://docs.crawl4ai.com"],  # Simple test page
                 browser_config=BrowserConfig(headless=True),
                 crawler_config=crawler_config,
+                hooks_timeout=timeout,
             )
             
             if results:
@@ -79,12 +81,11 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool:
         
     except Exception as e:
         print(f"❌ Docker client test failed: {e}")
-        import traceback
         traceback.print_exc()
         return False
 
 
-async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool:
+async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
     """Test using REST API directly."""
     print("\n" + "=" * 60)
     print("Testing with REST API")
@@ -95,7 +96,7 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool:
         "type": "BFSDeepCrawlStrategy",
         "params": {
             "max_depth": 2,
-            # "max_pages": 5,
+            "max_pages": max_pages,
             "filter_chain": {
                 "type": "FilterChain",
                 "params": {
@@ -123,7 +124,7 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool:
             response = await client.post(
                 f"{BASE_URL}crawl",
                 json=crawl_payload,
-                timeout=30
+                timeout=timeout,
             )
             
             if response.status_code == 200:
@@ -147,7 +148,6 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool:
         
     except Exception as e:
         print(f"❌ REST API test failed: {e}")
-        import traceback
         traceback.print_exc()
         return False
 
@@ -162,6 +162,8 @@ async def main():
     results = []
     
     # Test 1: Docker client
+    max_pages_ = [20, 5]
+    timeouts = [30, 60]
     filter_chain_test_cases = [
         [
             URLPatternFilter(
@@ -177,11 +179,13 @@ async def main():
             ),
         ],
     ]
-    for idx, filter_chain in enumerate(filter_chain_test_cases):
-        docker_passed = await test_with_docker_client(filter_chain=filter_chain)
+    for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
+        docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
         results.append((f"Docker Client w/ filter chain {idx}", docker_passed))
     
     # Test 2: REST API
+    max_pages_ = [20, 5, 5]
+    timeouts = [30, 60, 60]
     filters_test_cases = [
         [
             {
@@ -211,8 +215,8 @@ async def main():
             }
         ],
     ]
-    for idx, filters in enumerate(filters_test_cases):
-        rest_passed = await test_with_rest_api(filters=filters)
+    for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
+        rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
         results.append((f"REST API w/ filters {idx}", rest_passed))
     
     # Summary