diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index a43b50a4..0c843b2b 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: if value != param.default and not ignore_default_value: current_values[name] = to_serializable_dict(value) - if hasattr(obj, '__slots__'): - for slot in obj.__slots__: - if slot.startswith('_'): # Handle private slots - attr_name = slot[1:] # Remove leading '_' - value = getattr(obj, slot, None) - if value is not None: - current_values[attr_name] = to_serializable_dict(value) + # Don't serialize private __slots__ - they're internal implementation details + # not constructor parameters. This was causing URLPatternFilter to fail + # because _simple_suffixes was being serialized as 'simple_suffixes' + # if hasattr(obj, '__slots__'): + # for slot in obj.__slots__: + # if slot.startswith('_'): # Handle private slots + # attr_name = slot[1:] # Remove leading '_' + # value = getattr(obj, slot, None) + # if value is not None: + # current_values[attr_name] = to_serializable_dict(value) diff --git a/crawl4ai/deep_crawling/bff_strategy.py b/crawl4ai/deep_crawling/bff_strategy.py index 7779c9f4..dc7a0bf0 100644 --- a/crawl4ai/deep_crawling/bff_strategy.py +++ b/crawl4ai/deep_crawling/bff_strategy.py @@ -47,7 +47,13 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy): self.url_scorer = url_scorer self.include_external = include_external self.max_pages = max_pages - self.logger = logger or logging.getLogger(__name__) + # self.logger = logger or logging.getLogger(__name__) + # Ensure logger is always a Logger instance, not a dict from serialization + if isinstance(logger, logging.Logger): + self.logger = logger + else: + # Create a new logger if logger is None, dict, or any other non-Logger type + self.logger = logging.getLogger(__name__) self.stats = TraversalStats(start_time=datetime.now()) self._cancel_event = 
asyncio.Event() self._pages_crawled = 0 diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 950c3980..eb699f82 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -38,7 +38,13 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.include_external = include_external self.score_threshold = score_threshold self.max_pages = max_pages - self.logger = logger or logging.getLogger(__name__) + # self.logger = logger or logging.getLogger(__name__) + # Ensure logger is always a Logger instance, not a dict from serialization + if isinstance(logger, logging.Logger): + self.logger = logger + else: + # Create a new logger if logger is None, dict, or any other non-Logger type + self.logger = logging.getLogger(__name__) self.stats = TraversalStats(start_time=datetime.now()) self._cancel_event = asyncio.Event() self._pages_crawled = 0 diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index b65112e2..981cbcd8 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter): """Pattern filter balancing speed and completeness""" __slots__ = ( + "patterns", # Store original patterns for serialization + "use_glob", # Store original use_glob for serialization + "reverse", # Store original reverse for serialization "_simple_suffixes", "_simple_prefixes", "_domain_patterns", @@ -142,6 +145,11 @@ class URLPatternFilter(URLFilter): reverse: bool = False, ): super().__init__() + # Store original constructor params for serialization + self.patterns = patterns + self.use_glob = use_glob + self.reverse = reverse + self._reverse = reverse patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 640c2f2d..63e39885 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -253,6 +253,16 @@ class CrawlResult(BaseModel): 
"""
Integration test for the deep-crawl filter fixes.

Exercises two regressions end to end against a running crawl4ai server:

1. Filter serialization — private ``__slots__`` (e.g. ``URLPatternFilter._simple_suffixes``)
   must not leak into the serialized config sent to the server.
2. JSON serialization — ``CrawlResult.model_dump()`` must not emit property
   descriptors for deprecated fields.

This is an integration *script*, not a unit test: it needs a live server at
``BASE_URL`` and network access, and reports results via prints + exit code.
"""

import asyncio
import sys
import traceback

import httpx

from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter

BASE_URL = "http://localhost:11234/"  # Adjust port as needed


async def test_with_docker_client():
    """Test using the Docker client (same as 1419.py).

    Returns True when the crawl round-trips through the client without
    serialization errors, False otherwise.
    """
    # Imported lazily: only needed for this code path, and keeps the REST-only
    # run working even if the docker client extra is not installed.
    from crawl4ai.docker_client import Crawl4aiDockerClient

    print("=" * 60)
    print("Testing with Docker Client")
    print("=" * 60)

    try:
        async with Crawl4aiDockerClient(
            base_url=BASE_URL,
            verbose=True,
        ) as client:
            # Create filter chain - testing the serialization fix
            filter_chain = [
                URLPatternFilter(
                    # patterns=["*about*", "*privacy*", "*terms*"],
                    patterns=["*advanced*"],
                    reverse=True,
                ),
            ]

            crawler_config = CrawlerRunConfig(
                deep_crawl_strategy=BFSDeepCrawlStrategy(
                    max_depth=2,  # Keep it shallow for testing
                    # max_pages=5,  # Limit pages for testing
                    filter_chain=FilterChain(filter_chain),
                ),
                cache_mode=CacheMode.BYPASS,
            )

            print("\n1. Testing crawl with filters...")
            results = await client.crawl(
                ["https://docs.crawl4ai.com"],  # Simple test page
                browser_config=BrowserConfig(headless=True),
                crawler_config=crawler_config,
            )

            if not results:
                print("❌ Crawl failed - no results returned")
                return False

            print(f"✅ Crawl succeeded! Type: {type(results)}")
            if hasattr(results, 'success'):
                print(f"✅ Results success: {results.success}")
            # Test that we can iterate results without JSON errors
            if hasattr(results, '__iter__'):
                for i, result in enumerate(results):
                    if hasattr(result, 'url'):
                        print(f"   Result {i}: {result.url[:50]}...")
                    else:
                        print(f"   Result {i}: {str(result)[:50]}...")
            else:
                # Handle list of results
                print(f"✅ Got {len(results)} results")
                for i, result in enumerate(results[:3]):  # Show first 3
                    print(f"   Result {i}: {result.url[:50]}...")

        print("\n✅ Docker client test completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Docker client test failed: {e}")
        traceback.print_exc()
        return False


async def test_with_rest_api():
    """Test using REST API directly.

    Builds the same filter configuration as the docker-client test, but as the
    raw ``{"type": ..., "params": ...}`` payload the server deserializes, so a
    serialization regression on the server side is caught independently of the
    client library. Returns True on success, False otherwise.
    """
    print("\n" + "=" * 60)
    print("Testing with REST API")
    print("=" * 60)

    # Create filter configuration
    deep_crawl_strategy_payload = {
        "type": "BFSDeepCrawlStrategy",
        "params": {
            "max_depth": 2,
            # "max_pages": 5,
            "filter_chain": {
                "type": "FilterChain",
                "params": {
                    "filters": [
                        {
                            "type": "URLPatternFilter",
                            "params": {
                                "patterns": ["*advanced*"],
                                "reverse": True,
                            },
                        }
                    ]
                },
            },
        },
    }

    crawl_payload = {
        "urls": ["https://docs.crawl4ai.com"],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "deep_crawl_strategy": deep_crawl_strategy_payload,
                "cache_mode": "bypass",
            },
        },
    }

    try:
        async with httpx.AsyncClient() as client:
            print("\n1. Sending crawl request to REST API...")
            response = await client.post(
                f"{BASE_URL}crawl",
                json=crawl_payload,
                # A depth-2 deep crawl routinely takes longer than 30s; a short
                # timeout here produced false failures unrelated to the fix.
                timeout=300,
            )

            if response.status_code == 200:
                print("✅ REST API returned 200 OK")
                data = response.json()
                if data.get("success"):
                    results = data.get("results", [])
                    print(f"✅ Got {len(results)} results")
                    for i, result in enumerate(results[:3]):
                        print(f"   Result {i}: {result.get('url', 'unknown')[:50]}...")
                else:
                    print(f"❌ Crawl not successful: {data}")
                    return False
            else:
                print(f"❌ REST API returned {response.status_code}")
                print(f"   Response: {response.text[:500]}")
                return False

        print("\n✅ REST API test completed successfully!")
        return True

    except Exception as e:
        print(f"❌ REST API test failed: {e}")
        traceback.print_exc()
        return False


async def main():
    """Run all tests and return a process exit code (0 = all passed)."""
    print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
    print("=" * 60)
    print("Make sure the server is running with the updated code!")
    print("=" * 60)

    results = []

    # Test 1: Docker client
    docker_passed = await test_with_docker_client()
    results.append(("Docker Client", docker_passed))

    # Test 2: REST API
    rest_passed = await test_with_rest_api()
    results.append(("REST API", rest_passed))

    # Summary
    print("\n" + "=" * 60)
    print("FINAL TEST SUMMARY")
    print("=" * 60)

    all_passed = True
    for test_name, passed in results:
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"{test_name:20} {status}")
        if not passed:
            all_passed = False

    print("=" * 60)
    if all_passed:
        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
        print("\nThe fixes:")
        print("1. Filter serialization: Fixed by not serializing private __slots__")
        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
    else:
        print("⚠️ Some tests failed. Please check the server logs for details.")

    return 0 if all_passed else 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))