From 4679ee023d06f94ecad73a963ab23f9f0d08da14 Mon Sep 17 00:00:00 2001
From: ntohidi <nasrin@kidocode.com>
Date: Tue, 10 Jun 2025 11:19:18 +0200
Subject: [PATCH] fix: Enhance URLPatternFilter to enforce path boundary checks
 for prefix matching. ref #1003

---
 crawl4ai/deep_crawling/filters.py | 19 +++++--
 tests/general/test_url_pattern.py | 85 +++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 tests/general/test_url_pattern.py

diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 8d0bcc4d..b65112e2 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
         # Prefix check (/foo/*)
         if self._simple_prefixes:
             path = url.split("?")[0]
-            if any(path.startswith(p) for p in self._simple_prefixes):
-                result = True
-                self._update_stats(result)
-                return not result if self._reverse else result
+            # if any(path.startswith(p) for p in self._simple_prefixes):
+            #     result = True
+            #     self._update_stats(result)
+            #     return not result if self._reverse else result
+            ####
+            # Prefix matching enforces a path boundary:
+            # - A prefix matches only when it is followed by a path separator (`/`), a query marker (`?`), a fragment marker (`#`), or it ends the path
+            # - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/`
+            ####
+            for prefix in self._simple_prefixes:
+                if path.startswith(prefix):
+                    if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
+                        result = True
+                        self._update_stats(result)
+                        return not result if self._reverse else result
 
         # Complex patterns
         if self._path_patterns:
diff --git a/tests/general/test_url_pattern.py b/tests/general/test_url_pattern.py
new file mode 100644
index 00000000..3aea14d9
--- /dev/null
+++ b/tests/general/test_url_pattern.py
@@ -0,0 +1,85 @@
+import sys
+import os
+
+# Get the grandparent directory
+grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(grandparent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+import asyncio
+from crawl4ai.deep_crawling.filters import URLPatternFilter
+
+
+def test_prefix_boundary_matching():
+    """Check that a prefix pattern respects path boundaries; print one row per case and return True if all match."""
+    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")
+    # One pattern under test: everything below .../langgraph/.
+    filter_obj = URLPatternFilter(patterns=['https://langchain-ai.github.io/langgraph/*'])
+    # Each case is (url, expected result of filter_obj.apply(url)).
+    test_cases = [
+        ('https://langchain-ai.github.io/langgraph/', True),
+        ('https://langchain-ai.github.io/langgraph/concepts/', True),
+        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
+        ('https://langchain-ai.github.io/langgraph?param=1', True),
+        ('https://langchain-ai.github.io/langgraph#section', True),
+        ('https://langchain-ai.github.io/langgraphjs/', False),
+        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
+        ('https://other-site.com/langgraph/', False),
+    ]
+    # Evaluate every case (no early exit) so the full result table is printed.
+    all_passed = True
+    for url, expected in test_cases:
+        result = filter_obj.apply(url)
+        status = "PASS" if result == expected else "FAIL"
+        all_passed = all_passed and result == expected
+        # Format via !s: a width spec applied directly to a bool prints it as the int 1/0.
+        print(f"{status:4} | Expected: {expected!s:5} | Got: {result!s:5} | {url}")
+
+    return all_passed
+
+
+def test_edge_cases():
+    """Check path-boundary handling for several patterns; print one row per case and return True if all match."""
+    print("\n=== Testing Edge Cases ===")
+    # Each entry is (pattern, [(url, expected result of apply(url)), ...]).
+    test_patterns = [
+        ('/api/*', [
+            ('/api/', True),
+            ('/api/v1', True),
+            ('/api?param=1', True),
+            ('/apiv2/', False),
+            ('/api_old/', False),
+        ]),
+        # Leading-wildcard pattern: look-alike segments (`documentation`, `docs_old`) are expected not to match.
+        ('*/docs/*', [
+            ('example.com/docs/', True),
+            ('example.com/docs/guide', True),
+            ('example.com/documentation/', False),
+            ('example.com/docs_old/', False),
+        ]),
+    ]
+    # Keep going after a mismatch so every row is printed.
+    all_passed = True
+    for pattern, test_cases in test_patterns:
+        print(f"\nPattern: {pattern}")
+        # Build a dedicated filter for each pattern under test.
+        filter_obj = URLPatternFilter(patterns=[pattern])
+        for url, expected in test_cases:
+            result = filter_obj.apply(url)
+            status = "PASS" if result == expected else "FAIL"
+            all_passed = all_passed and result == expected
+            # Format via !s: a width spec applied directly to a bool prints it as the int 1/0.
+            print(f"  {status:4} | Expected: {expected!s:5} | Got: {result!s:5} | {url}")
+
+    return all_passed
+
+if __name__ == "__main__":
+    # Run both suites up front so each one prints its table regardless of the other's outcome.
+    outcomes = [test_prefix_boundary_matching(), test_edge_cases()]
+
+    if not all(outcomes):
+        print("\n❌ Some tests failed!")
+        sys.exit(1)
+    else:
+        print("\n✅ All tests passed!")
+        sys.exit(0)