Merge pull request #1648 from christopher-w-murphy/fix/content-relevance-filter

[Fix]: Docker server does not decode ContentRelevanceFilter
Nasrin
2025-12-03 18:36:07 +08:00
committed by GitHub
7 changed files with 169 additions and 53 deletions
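In short: the Docker server could not rebuild a {"type": "ContentRelevanceFilter", "params": {...}} payload into a filter instance, because from_serializable_dict only looked the class name up on the top-level crawl4ai package, which did not export it. A minimal sketch of the round trip this PR unblocks (illustrative, not part of the diff; it assumes the serializer helpers are importable from crawl4ai.async_configs, the module whose hunks appear below):

from crawl4ai.async_configs import to_serializable_dict, from_serializable_dict
from crawl4ai.deep_crawling import ContentRelevanceFilter

# Client side: the filter is encoded into the {type, params} wire format.
payload = to_serializable_dict(ContentRelevanceFilter(query="about faq", threshold=0.2))

# Server side: with ContentRelevanceFilter exported from crawl4ai/__init__.py and the
# importlib-based lookup added below, the payload decodes back into a filter instance.
restored = from_serializable_dict(payload)
assert isinstance(restored, ContentRelevanceFilter)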

View File

@@ -1034,7 +1034,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro
 | Company | About | Sponsorship Tier |
 |------|------|----------------------------|
-| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver |
+| <a href="https://app.scrapeless.com/passport/register?utm_source=official&utm_term=crawl4ai" target="_blank"><picture><source width="250" media="(prefers-color-scheme: dark)" srcset="https://gist.githubusercontent.com/aravindkarnam/0d275b942705604263e5c32d2db27bc1/raw/Scrapeless-light-logo.svg"><source width="250" media="(prefers-color-scheme: light)" srcset="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"><img alt="Scrapeless" src="https://gist.githubusercontent.com/aravindkarnam/22d0525cc0f3021bf19ebf6e11a69ccd/raw/Scrapeless-dark-logo.svg"></picture></a> | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver |
 | <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze |
 | <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives. | 🥇 Gold |
 | <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |

View File

@@ -72,6 +72,8 @@ from .deep_crawling import (
     BestFirstCrawlingStrategy,
     DFSDeepCrawlStrategy,
     DeepCrawlDecorator,
+    ContentRelevanceFilter,
+    ContentTypeScorer,
 )
 # NEW: Import AsyncUrlSeeder
 from .async_url_seeder import AsyncUrlSeeder

View File

@@ -1,5 +1,5 @@
+import importlib
 import os
-from typing import Union
 import warnings
 import requests
 from .config import (
@@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
-from typing import Union, List, Callable
 import inspect
-from typing import Any, Dict, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 from enum import Enum

 # Type alias for URL matching
 UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]

 class MatchMode(Enum):
     OR = "or"
     AND = "and"
@@ -42,8 +42,7 @@ class MatchMode(Enum):
 # from .proxy_strategy import ProxyConfig

-def to_serializable_dict(obj: Any, ignore_default_value : bool = False):
+def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
     """
     Recursively convert an object to a serializable dictionary using {type, params} structure
     for complex objects.
@@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
         # if value is not None:
         #     current_values[attr_name] = to_serializable_dict(value)

     return {
         "type": obj.__class__.__name__,
         "params": current_values
@@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any:
if data["type"] == "dict" and "value" in data: if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()} return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances cls = None
import crawl4ai # If you are receiving an error while trying to convert a dict to an object:
# Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file
if hasattr(crawl4ai, data["type"]): module_paths = ["crawl4ai"]
cls = getattr(crawl4ai, data["type"]) for module_path in module_paths:
try:
mod = importlib.import_module(module_path)
if hasattr(mod, data["type"]):
cls = getattr(mod, data["type"])
break
except (ImportError, AttributeError):
continue
if cls is not None:
# Handle Enum # Handle Enum
if issubclass(cls, Enum): if issubclass(cls, Enum):
return cls(data["params"]) return cls(data["params"])
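For reference (illustrative, not part of the diff), the payload shape this lookup resolves is the same {type, params} structure the test script at the end of this PR posts to the REST API; because ContentRelevanceFilter is now exported from crawl4ai/__init__.py, the single "crawl4ai" entry in module_paths is enough to find it:

data = {
    "type": "ContentRelevanceFilter",
    "params": {"query": "about faq", "threshold": 0.2},
}
# from_serializable_dict(data) now returns a ContentRelevanceFilter instance
# instead of failing to resolve the class name.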

View File

@@ -509,18 +509,22 @@ class DomainFilter(URLFilter):
 class ContentRelevanceFilter(URLFilter):
     """BM25-based relevance filter using head section content"""

-    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")
+    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query")

     def __init__(
         self,
-        query: str,
+        query: Union[str, List[str]],
         threshold: float,
         k1: float = 1.2,
         b: float = 0.75,
         avgdl: int = 1000,
     ):
         super().__init__(name="BM25RelevanceFilter")
-        self.query_terms = self._tokenize(query)
+        if isinstance(query, list):
+            self.query = " ".join(query)
+        else:
+            self.query = query
+        self.query_terms = self._tokenize(self.query)
         self.threshold = threshold
         self.k1 = k1  # TF saturation parameter
         self.b = b  # Length normalization parameter
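A short usage sketch of what the new signature allows (illustrative, not part of the diff): a list query is joined with spaces before tokenizing, so the two forms below end up with the same query terms.

from crawl4ai.deep_crawling import ContentRelevanceFilter

f_str = ContentRelevanceFilter(query="about faq", threshold=0.2)
f_list = ContentRelevanceFilter(query=["about", "faq"], threshold=0.2)

# The list form is normalized to "about faq" before tokenization,
# so both filters score <head> content identically.
assert f_list.query == "about faq"
assert f_str.query_terms == f_list.query_terms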

View File

@@ -180,7 +180,7 @@ class Crawl4aiDockerClient:
                 yield CrawlResult(**result)
             return stream_results()

-        response = await self._request("POST", "/crawl", json=data)
+        response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout)
         result_data = response.json()
         if not result_data.get("success", False):
             raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
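Client-side, the new argument is exercised by the test script further down: hooks_timeout is forwarded as the timeout of the POST /crawl request, giving slower server-side work such as ContentRelevanceFilter scoring room to finish. A hedged sketch, assuming an already-connected Crawl4aiDockerClient:

from crawl4ai import BrowserConfig

async def crawl_with_longer_timeout(client, crawler_config):
    # `client` is a connected Crawl4aiDockerClient; hooks_timeout (in seconds)
    # is passed straight through to self._request(...) for the /crawl call.
    return await client.crawl(
        ["https://docs.crawl4ai.com"],
        browser_config=BrowserConfig(headless=True),
        crawler_config=crawler_config,
        hooks_timeout=60,
    )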

View File

@@ -0,0 +1,61 @@
+import json
+import asyncio
+from urllib.parse import quote, urlencode
+
+from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler
+
+# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers:
+# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration
+
+
+async def main():
+    # customize browser fingerprint
+    fingerprint = {
+        "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36",
+        "platform": "Windows",
+        "screen": {
+            "width": 1280, "height": 1024
+        },
+        "localization": {
+            "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong",
+        }
+    }
+
+    fingerprint_json = json.dumps(fingerprint)
+    encoded_fingerprint = quote(fingerprint_json)
+
+    scrapeless_params = {
+        "token": "your token",
+        "sessionTTL": 1000,
+        "sessionName": "Demo",
+        "fingerprint": encoded_fingerprint,
+        # Sets the target country/region for the proxy, sending requests via an IP address from
+        # that region. You can specify a country code (e.g., US for the United States, GB for the
+        # United Kingdom, ANY for any country). See country codes for all supported options.
+        # "proxyCountry": "ANY",
+        # create profile on scrapeless
+        # "profileId": "your profileId",
+        # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started
+    }
+
+    query_string = urlencode(scrapeless_params)
+    scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}"
+
+    async with AsyncWebCrawler(
+        config=BrowserConfig(
+            headless=False,
+            browser_mode="cdp",
+            cdp_url=scrapeless_connection_url,
+        )
+    ) as crawler:
+        result = await crawler.arun(
+            url="https://www.scrapeless.com/en",
+            config=CrawlerRunConfig(
+                wait_for="css:.content",
+                scan_full_page=True,
+            ),
+        )
+        print("-" * 20)
+        print(f'Status Code: {result.status_code}')
+        print("-" * 20)
+        print(f'Title: {result.metadata["title"]}')
+        print(f'Description: {result.metadata["description"]}')
+        print("-" * 20)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())

View File

@@ -1,16 +1,31 @@
""" """
Test the complete fix for both the filter serialization and JSON serialization issues. Test the complete fix for both the filter serialization and JSON serialization issues.
""" """
import os
import traceback
from typing import Any
import asyncio import asyncio
import httpx import httpx
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter from crawl4ai.deep_crawling import (
BFSDeepCrawlStrategy,
ContentRelevanceFilter,
FilterChain,
URLFilter,
URLPatternFilter,
)
BASE_URL = "http://localhost:11234/" # Adjust port as needed CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234")
try:
BASE_PORT = int(CRAWL4AI_DOCKER_PORT)
except TypeError:
BASE_PORT = 11234
BASE_URL = f"http://localhost:{BASE_PORT}/" # Adjust port as needed
async def test_with_docker_client():
async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool:
"""Test using the Docker client (same as 1419.py).""" """Test using the Docker client (same as 1419.py)."""
from crawl4ai.docker_client import Crawl4aiDockerClient from crawl4ai.docker_client import Crawl4aiDockerClient
@@ -24,19 +39,10 @@ async def test_with_docker_client():
         verbose=True,
     ) as client:
-        # Create filter chain - testing the serialization fix
-        filter_chain = [
-            URLPatternFilter(
-                # patterns=["*about*", "*privacy*", "*terms*"],
-                patterns=["*advanced*"],
-                reverse=True
-            ),
-        ]
-
         crawler_config = CrawlerRunConfig(
             deep_crawl_strategy=BFSDeepCrawlStrategy(
                 max_depth=2,  # Keep it shallow for testing
-                # max_pages=5,  # Limit pages for testing
+                max_pages=max_pages,  # Limit pages for testing
                 filter_chain=FilterChain(filter_chain)
             ),
             cache_mode=CacheMode.BYPASS,
@@ -47,6 +53,7 @@ async def test_with_docker_client():
["https://docs.crawl4ai.com"], # Simple test page ["https://docs.crawl4ai.com"], # Simple test page
browser_config=BrowserConfig(headless=True), browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config, crawler_config=crawler_config,
hooks_timeout=timeout,
) )
if results: if results:
@@ -74,12 +81,11 @@ async def test_with_docker_client():
     except Exception as e:
         print(f"❌ Docker client test failed: {e}")
-        import traceback
         traceback.print_exc()
         return False


-async def test_with_rest_api():
+async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool:
     """Test using REST API directly."""
     print("\n" + "=" * 60)
     print("Testing with REST API")
@@ -90,19 +96,11 @@ async def test_with_rest_api():
"type": "BFSDeepCrawlStrategy", "type": "BFSDeepCrawlStrategy",
"params": { "params": {
"max_depth": 2, "max_depth": 2,
# "max_pages": 5, "max_pages": max_pages,
"filter_chain": { "filter_chain": {
"type": "FilterChain", "type": "FilterChain",
"params": { "params": {
"filters": [ "filters": filters
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*advanced*"],
"reverse": True
}
}
]
} }
} }
} }
@@ -126,7 +124,7 @@ async def test_with_rest_api():
             response = await client.post(
                 f"{BASE_URL}crawl",
                 json=crawl_payload,
-                timeout=30
+                timeout=timeout,
             )

             if response.status_code == 200:
@@ -150,7 +148,6 @@ async def test_with_rest_api():
     except Exception as e:
         print(f"❌ REST API test failed: {e}")
-        import traceback
         traceback.print_exc()
         return False
@@ -165,12 +162,62 @@ async def main():
     results = []

     # Test 1: Docker client
-    docker_passed = await test_with_docker_client()
-    results.append(("Docker Client", docker_passed))
+    max_pages_ = [20, 5]
+    timeouts = [30, 60]
+    filter_chain_test_cases = [
+        [
+            URLPatternFilter(
+                # patterns=["*about*", "*privacy*", "*terms*"],
+                patterns=["*advanced*"],
+                reverse=True
+            ),
+        ],
+        [
+            ContentRelevanceFilter(
+                query="about faq",
+                threshold=0.2,
+            ),
+        ],
+    ]
+    for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)):
+        docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout)
+        results.append((f"Docker Client w/ filter chain {idx}", docker_passed))

     # Test 2: REST API
-    rest_passed = await test_with_rest_api()
-    results.append(("REST API", rest_passed))
+    max_pages_ = [20, 5, 5]
+    timeouts = [30, 60, 60]
+    filters_test_cases = [
+        [
+            {
+                "type": "URLPatternFilter",
+                "params": {
+                    "patterns": ["*advanced*"],
+                    "reverse": True
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": "about faq",
+                    "threshold": 0.2,
+                }
+            }
+        ],
+        [
+            {
+                "type": "ContentRelevanceFilter",
+                "params": {
+                    "query": ["about", "faq"],
+                    "threshold": 0.2,
+                }
+            }
+        ],
+    ]
+    for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)):
+        rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout)
+        results.append((f"REST API w/ filters {idx}", rest_passed))

     # Summary
     print("\n" + "=" * 60)
@@ -186,10 +233,7 @@ async def main():
print("=" * 60) print("=" * 60)
if all_passed: if all_passed:
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!") print("🎉 ALL TESTS PASSED!")
print("\nThe fixes:")
print("1. Filter serialization: Fixed by not serializing private __slots__")
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
else: else:
print("⚠️ Some tests failed. Please check the server logs for details.") print("⚠️ Some tests failed. Please check the server logs for details.")