From 0024c82cdcbd7c2d9e9e93ec40c8fec2563eff8f Mon Sep 17 00:00:00 2001 From: Aravind Date: Mon, 24 Nov 2025 17:59:33 +0530 Subject: [PATCH 1/7] Sponsors/new (#1637) --- README.md | 2 +- .../cloud_browser/scrapeless_browser.py | 61 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) create mode 100644 docs/examples/cloud_browser/scrapeless_browser.py diff --git a/README.md b/README.md index 79161a8a..09178cb9 100644 --- a/README.md +++ b/README.md @@ -1034,7 +1034,7 @@ Our enterprise sponsors and technology partners help scale Crawl4AI to power pro | Company | About | Sponsorship Tier | |------|------|----------------------------| -| Scrapeless | Scrapeless is the best full-stack web scraping toolkit offering Scraping API, Scraping Browser, Web Unlocker, Captcha Solver, and Proxies, designed to handle all your data collection needs. | 🥈 Silver | +| Scrapeless | Scrapeless provides production-grade infrastructure for Crawling, Automation, and AI Agents, offering Scraping Browser, 4 Proxy Types and Universal Scraping API. | 🥈 Silver | | Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥉 Bronze | | DataSync | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold | | Kidocode

KidoCode

| Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold | diff --git a/docs/examples/cloud_browser/scrapeless_browser.py b/docs/examples/cloud_browser/scrapeless_browser.py new file mode 100644 index 00000000..4981c813 --- /dev/null +++ b/docs/examples/cloud_browser/scrapeless_browser.py @@ -0,0 +1,61 @@ +import json +import asyncio +from urllib.parse import quote, urlencode +from crawl4ai import CrawlerRunConfig, BrowserConfig, AsyncWebCrawler + +# Scrapeless provides a free anti-detection fingerprint browser client and cloud browsers: +# https://www.scrapeless.com/en/blog/scrapeless-nstbrowser-strategic-integration + +async def main(): + # customize browser fingerprint + fingerprint = { + "userAgent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.1.2.3 Safari/537.36", + "platform": "Windows", + "screen": { + "width": 1280, "height": 1024 + }, + "localization": { + "languages": ["zh-HK", "en-US", "en"], "timezone": "Asia/Hong_Kong", + } + } + + fingerprint_json = json.dumps(fingerprint) + encoded_fingerprint = quote(fingerprint_json) + + scrapeless_params = { + "token": "your token", + "sessionTTL": 1000, + "sessionName": "Demo", + "fingerprint": encoded_fingerprint, + # Sets the target country/region for the proxy, sending requests via an IP address from that region. You can specify a country code (e.g., US for the United States, GB for the United Kingdom, ANY for any country). See country codes for all supported options. + # "proxyCountry": "ANY", + # create profile on scrapeless + # "profileId": "your profileId", + # For more usage details, please refer to https://docs.scrapeless.com/en/scraping-browser/quickstart/getting-started + } + query_string = urlencode(scrapeless_params) + scrapeless_connection_url = f"wss://browser.scrapeless.com/api/v2/browser?{query_string}" + async with AsyncWebCrawler( + config=BrowserConfig( + headless=False, + browser_mode="cdp", + cdp_url=scrapeless_connection_url, + ) + ) as crawler: + result = await crawler.arun( + url="https://www.scrapeless.com/en", + config=CrawlerRunConfig( + wait_for="css:.content", + scan_full_page=True, + ), + ) + print("-" * 20) + print(f'Status Code: {result.status_code}') + print("-" * 20) + print(f'Title: {result.metadata["title"]}') + print(f'Description: {result.metadata["description"]}') + print("-" * 20) + +if __name__ == "__main__": + asyncio.run(main()) + \ No newline at end of file From 33a3cc3933c551d93098a4e4caab565cc78bc511 Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 11:31:07 -0500 Subject: [PATCH 2/7] reproduced AttributeError from #1642 --- tests/docker/test_filter_deep_crawl.py | 102 +++++++++++++++++-------- 1 file changed, 71 insertions(+), 31 deletions(-) diff --git a/tests/docker/test_filter_deep_crawl.py b/tests/docker/test_filter_deep_crawl.py index 4ee0df40..25feacd9 100644 --- a/tests/docker/test_filter_deep_crawl.py +++ b/tests/docker/test_filter_deep_crawl.py @@ -1,16 +1,30 @@ """ Test the complete fix for both the filter serialization and JSON serialization issues. """ +import os +from typing import Any import asyncio import httpx from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig -from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter +from crawl4ai.deep_crawling import ( + BFSDeepCrawlStrategy, + ContentRelevanceFilter, + FilterChain, + URLFilter, + URLPatternFilter, +) -BASE_URL = "http://localhost:11234/" # Adjust port as needed +CRAWL4AI_DOCKER_PORT = os.environ.get("CRAWL4AI_DOCKER_PORT", "11234") +try: + BASE_PORT = int(CRAWL4AI_DOCKER_PORT) +except TypeError: + BASE_PORT = 11234 +BASE_URL = f"http://localhost:{BASE_PORT}/" # Adjust port as needed -async def test_with_docker_client(): + +async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool: """Test using the Docker client (same as 1419.py).""" from crawl4ai.docker_client import Crawl4aiDockerClient @@ -24,15 +38,6 @@ async def test_with_docker_client(): verbose=True, ) as client: - # Create filter chain - testing the serialization fix - filter_chain = [ - URLPatternFilter( - # patterns=["*about*", "*privacy*", "*terms*"], - patterns=["*advanced*"], - reverse=True - ), - ] - crawler_config = CrawlerRunConfig( deep_crawl_strategy=BFSDeepCrawlStrategy( max_depth=2, # Keep it shallow for testing @@ -79,7 +84,7 @@ async def test_with_docker_client(): return False -async def test_with_rest_api(): +async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool: """Test using REST API directly.""" print("\n" + "=" * 60) print("Testing with REST API") @@ -94,15 +99,7 @@ async def test_with_rest_api(): "filter_chain": { "type": "FilterChain", "params": { - "filters": [ - { - "type": "URLPatternFilter", - "params": { - "patterns": ["*advanced*"], - "reverse": True - } - } - ] + "filters": filters } } } @@ -165,12 +162,58 @@ async def main(): results = [] # Test 1: Docker client - docker_passed = await test_with_docker_client() - results.append(("Docker Client", docker_passed)) + filter_chain_test_cases = [ + [ + URLPatternFilter( + # patterns=["*about*", "*privacy*", "*terms*"], + patterns=["*advanced*"], + reverse=True + ), + ], + [ + ContentRelevanceFilter( + query="about faq", + threshold=0.2, + ), + ], + ] + for idx, filter_chain in enumerate(filter_chain_test_cases): + docker_passed = await test_with_docker_client(filter_chain=filter_chain) + results.append((f"Docker Client w/ filter chain {idx}", docker_passed)) # Test 2: REST API - rest_passed = await test_with_rest_api() - results.append(("REST API", rest_passed)) + filters_test_cases = [ + [ + { + "type": "URLPatternFilter", + "params": { + "patterns": ["*advanced*"], + "reverse": True + } + } + ], + [ + { + "type": "ContentRelevanceFilter", + "params": { + "query": "about faq", + "threshold": 0.2, + } + } + ], + [ + { + "type": "ContentRelevanceFilter", + "params": { + "query": ["about", "faq"], + "threshold": 0.2, + } + } + ], + ] + for idx, filters in enumerate(filters_test_cases): + rest_passed = await test_with_rest_api(filters=filters) + results.append((f"REST API w/ filters {idx}", rest_passed)) # Summary print("\n" + "=" * 60) @@ -186,10 +229,7 @@ async def main(): print("=" * 60) if all_passed: - print("🎉 ALL TESTS PASSED! Both issues are fully resolved!") - print("\nThe fixes:") - print("1. Filter serialization: Fixed by not serializing private __slots__") - print("2. JSON serialization: Fixed by removing property descriptors from model_dump()") + print("🎉 ALL TESTS PASSED!") else: print("⚠️ Some tests failed. Please check the server logs for details.") @@ -198,4 +238,4 @@ async def main(): if __name__ == "__main__": import sys - sys.exit(asyncio.run(main())) \ No newline at end of file + sys.exit(asyncio.run(main())) From 6ec6bc4d8aee72484f7ae567aba74ec3da9f5753 Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 16:15:27 -0500 Subject: [PATCH 3/7] pass timeout parameter to docker client request --- crawl4ai/docker_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py index 969fee7c..6624cf07 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -180,7 +180,7 @@ class Crawl4aiDockerClient: yield CrawlResult(**result) return stream_results() - response = await self._request("POST", "/crawl", json=data) + response = await self._request("POST", "/crawl", json=data, timeout=hooks_timeout) result_data = response.json() if not result_data.get("success", False): raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}") From eb76df2c0d8a5d77667fcaaa413655639e9208ca Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 16:15:58 -0500 Subject: [PATCH 4/7] added missing deep crawling objects to init --- crawl4ai/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 8f1fdef4..af35e6a0 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -72,6 +72,8 @@ from .deep_crawling import ( BestFirstCrawlingStrategy, DFSDeepCrawlStrategy, DeepCrawlDecorator, + ContentRelevanceFilter, + ContentTypeScorer, ) # NEW: Import AsyncUrlSeeder from .async_url_seeder import AsyncUrlSeeder From e95e8e1a974ebee12ba42a21387fd7ecc7d8fec9 Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 16:16:31 -0500 Subject: [PATCH 5/7] generalized query in ContentRelevanceFilter to be a str or list --- crawl4ai/deep_crawling/filters.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index 981cbcd8..c075cb7d 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -509,18 +509,22 @@ class DomainFilter(URLFilter): class ContentRelevanceFilter(URLFilter): """BM25-based relevance filter using head section content""" - __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl") + __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl", "query") def __init__( self, - query: str, + query: Union[str, List[str]], threshold: float, k1: float = 1.2, b: float = 0.75, avgdl: int = 1000, ): super().__init__(name="BM25RelevanceFilter") - self.query_terms = self._tokenize(query) + if isinstance(query, list): + self.query = " ".join(query) + else: + self.query = query + self.query_terms = self._tokenize(self.query) self.threshold = threshold self.k1 = k1 # TF saturation parameter self.b = b # Length normalization parameter From 3a8f8298d3357049251e611ccdbba233d56f2e61 Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 16:18:59 -0500 Subject: [PATCH 6/7] import modules from enhanceable deserialization --- crawl4ai/async_configs.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index bfa0d398..eee43547 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,5 +1,5 @@ +import importlib import os -from typing import Union import warnings import requests from .config import ( @@ -27,14 +27,14 @@ from .table_extraction import TableExtractionStrategy, DefaultTableExtraction from .cache_context import CacheMode from .proxy_strategy import ProxyRotationStrategy -from typing import Union, List, Callable import inspect -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, List, Optional, Union from enum import Enum # Type alias for URL matching UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]] + class MatchMode(Enum): OR = "or" AND = "and" @@ -42,8 +42,7 @@ class MatchMode(Enum): # from .proxy_strategy import ProxyConfig - -def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: +def to_serializable_dict(obj: Any, ignore_default_value : bool = False): """ Recursively convert an object to a serializable dictionary using {type, params} structure for complex objects. @@ -110,8 +109,6 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: # if value is not None: # current_values[attr_name] = to_serializable_dict(value) - - return { "type": obj.__class__.__name__, "params": current_values @@ -137,12 +134,20 @@ def from_serializable_dict(data: Any) -> Any: if data["type"] == "dict" and "value" in data: return {k: from_serializable_dict(v) for k, v in data["value"].items()} - # Import from crawl4ai for class instances - import crawl4ai - - if hasattr(crawl4ai, data["type"]): - cls = getattr(crawl4ai, data["type"]) + cls = None + # If you are receiving an error while trying to convert a dict to an object: + # Either add a module to `modules_paths` list, or add the `data["type"]` to the crawl4ai __init__.py file + module_paths = ["crawl4ai"] + for module_path in module_paths: + try: + mod = importlib.import_module(module_path) + if hasattr(mod, data["type"]): + cls = getattr(mod, data["type"]) + break + except (ImportError, AttributeError): + continue + if cls is not None: # Handle Enum if issubclass(cls, Enum): return cls(data["params"]) From 6893094f58582e7787888d65582c8d5767b14645 Mon Sep 17 00:00:00 2001 From: Chris Murphy Date: Mon, 1 Dec 2025 16:19:19 -0500 Subject: [PATCH 7/7] parameterized tests --- tests/docker/test_filter_deep_crawl.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/docker/test_filter_deep_crawl.py b/tests/docker/test_filter_deep_crawl.py index 25feacd9..9e82073c 100644 --- a/tests/docker/test_filter_deep_crawl.py +++ b/tests/docker/test_filter_deep_crawl.py @@ -2,6 +2,7 @@ Test the complete fix for both the filter serialization and JSON serialization issues. """ import os +import traceback from typing import Any import asyncio @@ -24,7 +25,7 @@ except TypeError: BASE_URL = f"http://localhost:{BASE_PORT}/" # Adjust port as needed -async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool: +async def test_with_docker_client(filter_chain: list[URLFilter], max_pages: int = 20, timeout: int = 30) -> bool: """Test using the Docker client (same as 1419.py).""" from crawl4ai.docker_client import Crawl4aiDockerClient @@ -41,7 +42,7 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool: crawler_config = CrawlerRunConfig( deep_crawl_strategy=BFSDeepCrawlStrategy( max_depth=2, # Keep it shallow for testing - # max_pages=5, # Limit pages for testing + max_pages=max_pages, # Limit pages for testing filter_chain=FilterChain(filter_chain) ), cache_mode=CacheMode.BYPASS, @@ -52,6 +53,7 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool: ["https://docs.crawl4ai.com"], # Simple test page browser_config=BrowserConfig(headless=True), crawler_config=crawler_config, + hooks_timeout=timeout, ) if results: @@ -79,12 +81,11 @@ async def test_with_docker_client(filter_chain: list[URLFilter]) -> bool: except Exception as e: print(f"❌ Docker client test failed: {e}") - import traceback traceback.print_exc() return False -async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool: +async def test_with_rest_api(filters: list[dict[str, Any]], max_pages: int = 20, timeout: int = 30) -> bool: """Test using REST API directly.""" print("\n" + "=" * 60) print("Testing with REST API") @@ -95,7 +96,7 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool: "type": "BFSDeepCrawlStrategy", "params": { "max_depth": 2, - # "max_pages": 5, + "max_pages": max_pages, "filter_chain": { "type": "FilterChain", "params": { @@ -123,7 +124,7 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool: response = await client.post( f"{BASE_URL}crawl", json=crawl_payload, - timeout=30 + timeout=timeout, ) if response.status_code == 200: @@ -147,7 +148,6 @@ async def test_with_rest_api(filters: list[dict[str, Any]]) -> bool: except Exception as e: print(f"❌ REST API test failed: {e}") - import traceback traceback.print_exc() return False @@ -162,6 +162,8 @@ async def main(): results = [] # Test 1: Docker client + max_pages_ = [20, 5] + timeouts = [30, 60] filter_chain_test_cases = [ [ URLPatternFilter( @@ -177,11 +179,13 @@ async def main(): ), ], ] - for idx, filter_chain in enumerate(filter_chain_test_cases): - docker_passed = await test_with_docker_client(filter_chain=filter_chain) + for idx, (filter_chain, max_pages, timeout) in enumerate(zip(filter_chain_test_cases, max_pages_, timeouts)): + docker_passed = await test_with_docker_client(filter_chain=filter_chain, max_pages=max_pages, timeout=timeout) results.append((f"Docker Client w/ filter chain {idx}", docker_passed)) # Test 2: REST API + max_pages_ = [20, 5, 5] + timeouts = [30, 60, 60] filters_test_cases = [ [ { @@ -211,8 +215,8 @@ async def main(): } ], ] - for idx, filters in enumerate(filters_test_cases): - rest_passed = await test_with_rest_api(filters=filters) + for idx, (filters, max_pages, timeout) in enumerate(zip(filters_test_cases, max_pages_, timeouts)): + rest_passed = await test_with_rest_api(filters=filters, max_pages=max_pages, timeout=timeout) results.append((f"REST API w/ filters {idx}", rest_passed)) # Summary