fix: Enhance URLPatternFilter to enforce path boundary checks for prefix matching. ref #1003
This commit is contained in:
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
|
|||||||
# Prefix check (/foo/*)
|
# Prefix check (/foo/*)
|
||||||
if self._simple_prefixes:
|
if self._simple_prefixes:
|
||||||
path = url.split("?")[0]
|
path = url.split("?")[0]
|
||||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
# if any(path.startswith(p) for p in self._simple_prefixes):
|
||||||
result = True
|
# result = True
|
||||||
self._update_stats(result)
|
# self._update_stats(result)
|
||||||
return not result if self._reverse else result
|
# return not result if self._reverse else result
|
||||||
|
####
|
||||||
|
# Modified the prefix matching logic to ensure path boundary checking:
|
||||||
|
# - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path
|
||||||
|
# - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/`
|
||||||
|
####
|
||||||
|
for prefix in self._simple_prefixes:
|
||||||
|
if path.startswith(prefix):
|
||||||
|
if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
|
||||||
|
result = True
|
||||||
|
self._update_stats(result)
|
||||||
|
return not result if self._reverse else result
|
||||||
|
|
||||||
# Complex patterns
|
# Complex patterns
|
||||||
if self._path_patterns:
|
if self._path_patterns:
|
||||||
|
|||||||
85
tests/general/test_url_pattern.py
Normal file
85
tests/general/test_url_pattern.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Get the grandparent directory
|
||||||
|
grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
sys.path.append(grandparent_dir)
|
||||||
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai.deep_crawling.filters import URLPatternFilter
|
||||||
|
|
||||||
|
|
||||||
|
def _run_cases(filter_obj, test_cases, indent=""):
    """Apply *filter_obj* to each URL, print one report line per case, return mismatches.

    Args:
        filter_obj: Object exposing ``apply(url)``; here a ``URLPatternFilter``.
        test_cases: Iterable of ``(url, expected)`` pairs.
        indent: Text printed before every report line.

    Returns:
        A list of ``(url, expected, result)`` tuples, one per case whose
        ``apply`` result did not equal ``expected``. Empty when all matched.
    """
    failures = []
    for url, expected in test_cases:
        result = filter_obj.apply(url)
        status = "PASS" if result == expected else "FAIL"
        if result != expected:
            failures.append((url, expected, result))
        # Convert to str before padding: a width spec applied directly to a
        # bool formats it as an integer ("    1") rather than "True ".
        print(f"{indent}{status:4} | Expected: {str(expected):5} | Got: {str(result):5} | {url}")
    return failures


def test_prefix_boundary_matching():
    """Test that prefix patterns respect path boundaries.

    Raises:
        AssertionError: If any URL's filter result differs from its expected
            value. Raising (instead of returning a bool) is what lets pytest
            register the failure.
    """
    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")

    filter_obj = URLPatternFilter(patterns=['https://langchain-ai.github.io/langgraph/*'])

    test_cases = [
        ('https://langchain-ai.github.io/langgraph/', True),
        ('https://langchain-ai.github.io/langgraph/concepts/', True),
        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
        ('https://langchain-ai.github.io/langgraph?param=1', True),
        ('https://langchain-ai.github.io/langgraph#section', True),
        ('https://langchain-ai.github.io/langgraphjs/', False),
        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
        ('https://other-site.com/langgraph/', False),
    ]

    failures = _run_cases(filter_obj, test_cases)
    # Explicit raise rather than `assert` so the check survives `python -O`.
    if failures:
        raise AssertionError(
            f"prefix boundary mismatches (url, expected, got): {failures}"
        )


def test_edge_cases():
    """Test edge cases for path boundary matching.

    Every pattern is exercised even if an earlier one has mismatches, so the
    printed report is complete before the test fails.

    Raises:
        AssertionError: If any URL's filter result differs from its expected
            value for any pattern.
    """
    print("\n=== Testing Edge Cases ===")

    test_patterns = [
        ('/api/*', [
            ('/api/', True),
            ('/api/v1', True),
            ('/api?param=1', True),
            ('/apiv2/', False),
            ('/api_old/', False),
        ]),

        ('*/docs/*', [
            ('example.com/docs/', True),
            ('example.com/docs/guide', True),
            ('example.com/documentation/', False),
            ('example.com/docs_old/', False),
        ]),
    ]

    failures = []
    for pattern, test_cases in test_patterns:
        print(f"\nPattern: {pattern}")
        filter_obj = URLPatternFilter(patterns=[pattern])
        failures.extend(
            (pattern, url, expected, result)
            for url, expected, result in _run_cases(filter_obj, test_cases, indent="  ")
        )

    if failures:
        raise AssertionError(
            f"edge case mismatches (pattern, url, expected, got): {failures}"
        )


if __name__ == "__main__":
    # Script mode: run every test even if an earlier one fails, then report
    # the overall outcome through the process exit code.
    all_passed = True
    for test_func in (test_prefix_boundary_matching, test_edge_cases):
        try:
            test_func()
        except AssertionError as exc:
            all_passed = False
            print(f"\n{test_func.__name__} failed: {exc}")

    if all_passed:
        print("\n✅ All tests passed!")
        sys.exit(0)
    else:
        print("\n❌ Some tests failed!")
        sys.exit(1)
|
||||||
Reference in New Issue
Block a user