fix: Enhance URLPatternFilter to enforce path boundary checks for prefix matching. ref #1003

2025-06-10 11:19:18 +02:00
parent f9b7090084
commit 4679ee023d
2 changed files with 100 additions and 4 deletions
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
        # Prefix check (/foo/*)
        if self._simple_prefixes:
            path = url.split("?")[0]
-            if any(path.startswith(p) for p in self._simple_prefixes):
-                result = True
-                self._update_stats(result)
-                return not result if self._reverse else result
+            # if any(path.startswith(p) for p in self._simple_prefixes):
+            #     result = True
+            #     self._update_stats(result)
+            #     return not result if self._reverse else result
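+            ####
+            # Prefix matching with path-boundary checking:
+            # - A prefix only counts as a match when it is followed by a path separator (`/`), a fragment marker (`#`), or the end of the path. The `?` entry is kept for safety, but the query string has already been stripped from `path` above, so it cannot occur here.
+            # - This ensures that a pattern such as `/api/*` matches complete path segments (e.g. `/api/v1`) and not longer names that merely share the prefix text (e.g. `/apiv2/`).
+            ####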
+            for prefix in self._simple_prefixes:
+                if path.startswith(prefix):
+                    if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
+                        result = True
+                        self._update_stats(result)
+                        return not result if self._reverse else result

        # Complex patterns
        if self._path_patterns:
--- a/tests/general/test_url_pattern.py
+++ b/tests/general/test_url_pattern.py
@@ -0,0 +1,85 @@
+import sys
+import os
+
+# Get the grandparent directory
+grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(grandparent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import asyncio
+from crawl4ai.deep_crawling.filters import URLPatternFilter
+
+
+def test_prefix_boundary_matching():
+    """Check that a full-URL prefix pattern respects path boundaries.
+
+    Builds a URLPatternFilter for 'https://langchain-ai.github.io/langgraph/*'
+    and applies it to URLs that are expected to match (the prefix itself,
+    sub-paths, and the prefix followed by a query string or fragment) and to
+    URLs that are expected not to match (a sibling path that merely shares the
+    prefix text, and a different host). One PASS/FAIL line is printed per URL.
+
+    Returns:
+        True when every URL produced the expected result, False otherwise.
+        The outcome is returned rather than asserted so that the ``__main__``
+        runner can combine it with the other test's outcome; a runner that
+        ignores return values will not see a failure from this function.
+    """
+    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")
+
+    filter_obj = URLPatternFilter(patterns=['https://langchain-ai.github.io/langgraph/*'])
+
+    test_cases = [
+        ('https://langchain-ai.github.io/langgraph/', True),
+        ('https://langchain-ai.github.io/langgraph/concepts/', True),
+        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
+        ('https://langchain-ai.github.io/langgraph?param=1', True),
+        ('https://langchain-ai.github.io/langgraph#section', True),
+        ('https://langchain-ai.github.io/langgraphjs/', False),
+        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
+        ('https://other-site.com/langgraph/', False),
+    ]
+
+    all_passed = True
+    for url, expected in test_cases:
+        result = filter_obj.apply(url)
+        status = "PASS" if result == expected else "FAIL"
+        if result != expected:
+            all_passed = False
+        # `!s` renders the values as True/False; a bare width spec would format
+        # a bool through int and print 1/0 instead.
+        print(f"{status:4} | Expected: {expected!s:5} | Got: {result!s:5} | {url}")
+
+    return all_passed
+
+
+def test_edge_cases():
+    """Check path-boundary matching for path-only and wildcard-led patterns.
+
+    For each pattern ('/api/*' and '*/docs/*') a fresh URLPatternFilter is
+    built and applied to a list of URLs paired with the expected result. The
+    negative cases are names that share the pattern's text without sitting on
+    a segment boundary (e.g. '/apiv2/', 'example.com/docs_old/'). One
+    PASS/FAIL line is printed per URL, grouped under its pattern.
+
+    Returns:
+        True when every URL produced the expected result, False otherwise.
+        The outcome is returned rather than asserted so that the ``__main__``
+        runner can combine it with the other test's outcome; a runner that
+        ignores return values will not see a failure from this function.
+    """
+    print("\n=== Testing Edge Cases ===")
+
+    test_patterns = [
+        ('/api/*', [
+            ('/api/', True),
+            ('/api/v1', True),
+            ('/api?param=1', True),
+            ('/apiv2/', False),
+            ('/api_old/', False),
+        ]),
+
+        ('*/docs/*', [
+            ('example.com/docs/', True),
+            ('example.com/docs/guide', True),
+            ('example.com/documentation/', False),
+            ('example.com/docs_old/', False),
+        ]),
+    ]
+
+    all_passed = True
+    for pattern, test_cases in test_patterns:
+        print(f"\nPattern: {pattern}")
+        filter_obj = URLPatternFilter(patterns=[pattern])
+
+        for url, expected in test_cases:
+            result = filter_obj.apply(url)
+            status = "PASS" if result == expected else "FAIL"
+            if result != expected:
+                all_passed = False
+            # `!s` renders the values as True/False; a bare width spec would
+            # format a bool through int and print 1/0 instead.
+            print(f"  {status:4} | Expected: {expected!s:5} | Got: {result!s:5} | {url}")
+
+    return all_passed
+
+if __name__ == "__main__":
+    # Script entry point: run both checks (each prints its own report), then
+    # exit with status 0 only if every one of them reported success.
+    outcomes = [test_prefix_boundary_matching(), test_edge_cases()]
+
+    if all(outcomes):
+        print("\n✅ All tests passed!")
+        sys.exit(0)
+
+    print("\n❌ Some tests failed!")
+    sys.exit(1)