crawl4ai/tests/general/test_url_pattern.py

import sys
import os

# Get the grandparent directory
grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(grandparent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

import asyncio
from crawl4ai.deep_crawling.filters import URLPatternFilter


def test_prefix_boundary_matching():
    """Test that prefix patterns respect path boundaries"""
    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")

    filter_obj = URLPatternFilter(patterns=['https://langchain-ai.github.io/langgraph/*'])

    test_cases = [
        ('https://langchain-ai.github.io/langgraph/', True),
        ('https://langchain-ai.github.io/langgraph/concepts/', True),
        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
        ('https://langchain-ai.github.io/langgraph?param=1', True),
        ('https://langchain-ai.github.io/langgraph#section', True),
        ('https://langchain-ai.github.io/langgraphjs/', False),
        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
        ('https://other-site.com/langgraph/', False),
    ]

    all_passed = True
    for url, expected in test_cases:
        result = filter_obj.apply(url)
        status = "PASS" if result == expected else "FAIL"
        if result != expected:
            all_passed = False
        print(f"{status:4} | Expected: {expected:5} | Got: {result:5} | {url}")

    return all_passed


def test_edge_cases():
    """Test edge cases for path boundary matching"""
    print("\n=== Testing Edge Cases ===")

    test_patterns = [
        ('/api/*', [
            ('/api/', True),
            ('/api/v1', True),
            ('/api?param=1', True),
            ('/apiv2/', False),
            ('/api_old/', False),
        ]),

        ('*/docs/*', [
            ('example.com/docs/', True),
            ('example.com/docs/guide', True),
            ('example.com/documentation/', False),
            ('example.com/docs_old/', False),
        ]),
    ]

    all_passed = True
    for pattern, test_cases in test_patterns:
        print(f"\nPattern: {pattern}")
        filter_obj = URLPatternFilter(patterns=[pattern])

        for url, expected in test_cases:
            result = filter_obj.apply(url)
            status = "PASS" if result == expected else "FAIL"
            if result != expected:
                all_passed = False
            print(f"  {status:4} | Expected: {expected:5} | Got: {result:5} | {url}")

    return all_passed

if __name__ == "__main__":
    test1_passed = test_prefix_boundary_matching()
    test2_passed = test_edge_cases()

    if test1_passed and test2_passed:
        print("\n✅ All tests passed!")
        sys.exit(0)
    else:
        print("\n❌ Some tests failed!")
        sys.exit(1)