From 4679ee023d06f94ecad73a963ab23f9f0d08da14 Mon Sep 17 00:00:00 2001
From: ntohidi
Date: Tue, 10 Jun 2025 11:19:18 +0200
Subject: [PATCH] fix: Enhance URLPatternFilter to enforce path boundary checks for prefix matching. ref #1003

---
Review notes (this section is ignored by `git am`):
- filters.py: dropped the commented-out old implementation and the
  unreachable `?` boundary check (the query string is stripped before the
  prefix loop runs).
- test_url_pattern.py: tests now assert instead of returning a bool, report
  True/False instead of 1/0, and no longer carry unused names.
- Leading whitespace was reconstructed and the post-image blob ids on the
  `index` lines still describe the previous revision of this patch;
  regenerate with `git format-patch` before applying with --3way.

 crawl4ai/deep_crawling/filters.py | 13 +++--
 tests/general/test_url_pattern.py | 89 ++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 4 deletions(-)
 create mode 100644 tests/general/test_url_pattern.py

diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 8d0bcc4d..b65112e2 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -227,10 +227,15 @@ class URLPatternFilter(URLFilter):
         # Prefix check (/foo/*)
         if self._simple_prefixes:
             path = url.split("?")[0]
-            if any(path.startswith(p) for p in self._simple_prefixes):
-                result = True
-                self._update_stats(result)
-                return not result if self._reverse else result
+            # A prefix only counts when it ends on a path boundary: either the
+            # path stops right after it, or the next character is `/` or `#`.
+            # The query string was stripped above, so `?` can never follow it.
+            for prefix in self._simple_prefixes:
+                if path.startswith(prefix):
+                    if len(path) == len(prefix) or path[len(prefix)] in "/#":
+                        result = True
+                        self._update_stats(result)
+                        return not result if self._reverse else result
 
         # Complex patterns
         if self._path_patterns:
diff --git a/tests/general/test_url_pattern.py b/tests/general/test_url_pattern.py
new file mode 100644
index 00000000..3aea14d9
--- /dev/null
+++ b/tests/general/test_url_pattern.py
@@ -0,0 +1,89 @@
+"""Regression tests for URLPatternFilter path-boundary prefix matching (ref #1003)."""
+import os
+import sys
+
+# Make the repository root importable when this file is run as a script.
+grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(grandparent_dir)
+
+from crawl4ai.deep_crawling.filters import URLPatternFilter
+
+
+def _collect_failures(pattern, test_cases, indent=""):
+    """Apply `pattern` to every URL, print a report line, return the mismatches."""
+    filter_obj = URLPatternFilter(patterns=[pattern])
+    failures = []
+    for url, expected in test_cases:
+        result = filter_obj.apply(url)
+        status = "PASS" if result == expected else "FAIL"
+        if result != expected:
+            failures.append((pattern, url, expected, result))
+        # `!s` keeps True/False readable; a bare width spec renders bools as 1/0.
+        print(f"{indent}{status:4} | Expected: {expected!s:5} | Got: {result!s:5} | {url}")
+    return failures
+
+
+def test_prefix_boundary_matching():
+    """Test that prefix patterns respect path boundaries"""
+    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")
+
+    test_cases = [
+        ('https://langchain-ai.github.io/langgraph/', True),
+        ('https://langchain-ai.github.io/langgraph/concepts/', True),
+        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
+        ('https://langchain-ai.github.io/langgraph?param=1', True),
+        ('https://langchain-ai.github.io/langgraph#section', True),
+        ('https://langchain-ai.github.io/langgraphjs/', False),
+        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
+        ('https://other-site.com/langgraph/', False),
+    ]
+
+    failures = _collect_failures('https://langchain-ai.github.io/langgraph/*', test_cases)
+    # Assert instead of returning a bool: pytest ignores return values.
+    assert not failures, f"Unexpected filter results: {failures}"
+
+
+def test_edge_cases():
+    """Test edge cases for path boundary matching"""
+    print("\n=== Testing Edge Cases ===")
+
+    test_patterns = [
+        ('/api/*', [
+            ('/api/', True),
+            ('/api/v1', True),
+            ('/api?param=1', True),
+            ('/apiv2/', False),
+            ('/api_old/', False),
+        ]),
+
+        ('*/docs/*', [
+            ('example.com/docs/', True),
+            ('example.com/docs/guide', True),
+            ('example.com/documentation/', False),
+            ('example.com/docs_old/', False),
+        ]),
+    ]
+
+    failures = []
+    for pattern, test_cases in test_patterns:
+        print(f"\nPattern: {pattern}")
+        failures.extend(_collect_failures(pattern, test_cases, indent=" "))
+    # Assert instead of returning a bool: pytest ignores return values.
+    assert not failures, f"Unexpected filter results: {failures}"
+
+
+if __name__ == "__main__":
+    # Run every test even if an earlier one fails, then report once.
+    any_failed = False
+    for test in (test_prefix_boundary_matching, test_edge_cases):
+        try:
+            test()
+        except AssertionError as exc:
+            any_failed = True
+            print(f"\n{test.__name__}: {exc}")
+
+    if any_failed:
+        print("\n❌ Some tests failed!")
+        sys.exit(1)
+    print("\n✅ All tests passed!")
+    sys.exit(0)