Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig] (see the sketch below)
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to the default

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: use PDFContentScrapingStrategy without JavaScript
- Blogs: apply content filtering to reduce noise
- APIs: skip JavaScript, use JSON extraction
- Dynamic sites: execute only the JavaScript that is necessary

Breaking changes: none; fully backward compatible.
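Before the test file, here is a minimal end-to-end sketch of what the per-URL dispatch might look like. It is an illustration under assumptions, not code from this change: the top-level import path, the urls=/config= keyword names, and the result attributes (url, success) follow common crawl4ai usage but are not exercised by the test below, and the fallback behavior for unmatched URLs is taken from the description above.

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig  # assumed import path

async def main():
    # One config per content type; per the description, the first config
    # whose url_matcher matches a given URL wins.
    configs = [
        # PDFs: no JavaScript needed (a PDF scraping strategy could be set here)
        CrawlerRunConfig(url_matcher="*.pdf"),
        # Blog/article pages: where content filtering would be applied
        CrawlerRunConfig(url_matcher=["*/blog/*", "*/article/*"]),
        # API endpoints: matchers can also be plain functions
        CrawlerRunConfig(url_matcher=lambda url: "api" in url),
    ]
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=[
                "https://example.com/whitepaper.pdf",
                "https://example.com/blog/launch",
                "https://api.example.com/v1/users",
            ],
            config=configs,  # List[CrawlerRunConfig]: dispatcher selects per URL
        )
        for result in results:
            print(result.url, result.success)

asyncio.run(main())

Because arun_many() still accepts a single CrawlerRunConfig, existing callers are untouched, which is what makes the change backward compatible.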
"""
|
|
Test only the config matching logic without running crawler
|
|
"""
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
|
|
|
def test_all_matching_scenarios():
|
|
print("Testing CrawlerRunConfig.is_match() method")
|
|
print("=" * 50)
|
|
|
|
# Test 1: Single string pattern
|
|
print("\n1. Single string pattern (glob style)")
|
|
config = CrawlerRunConfig(
|
|
url_matcher="*.pdf",
|
|
# For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
|
|
)
|
|
test_urls = [
|
|
("https://example.com/file.pdf", True),
|
|
("https://example.com/doc.PDF", False), # Case sensitive
|
|
("https://example.com/file.txt", False),
|
|
("file.pdf", True),
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 2: List of patterns with OR
|
|
print("\n2. List of patterns with OR (default)")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=["*/article/*", "*/blog/*", "*.html"],
|
|
match_mode=MatchMode.OR
|
|
)
|
|
test_urls = [
|
|
("https://example.com/article/news", True),
|
|
("https://example.com/blog/post", True),
|
|
("https://example.com/page.html", True),
|
|
("https://example.com/page.php", False),
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 3: Custom function
|
|
print("\n3. Custom function matcher")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
|
|
)
|
|
test_urls = [
|
|
("https://api.example.com/data.json", True),
|
|
("https://api.example.com/data.xml", True),
|
|
("https://api.example.com/data.html", False),
|
|
("https://example.com/data.json", False), # No 'api'
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 4: Mixed list with AND
|
|
print("\n4. Mixed patterns and functions with AND")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"https://*", # Must be HTTPS
|
|
lambda url: '.com' in url, # Must have .com
|
|
lambda url: len(url) < 50 # Must be short
|
|
],
|
|
match_mode=MatchMode.AND
|
|
)
|
|
test_urls = [
|
|
("https://example.com/page", True),
|
|
("http://example.com/page", False), # Not HTTPS
|
|
("https://example.org/page", False), # No .com
|
|
("https://example.com/" + "x" * 50, False), # Too long
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 5: Complex real-world scenario
|
|
print("\n5. Complex pattern combinations")
|
|
config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"*/api/v[0-9]/*", # API versioned endpoints
|
|
lambda url: 'graphql' in url, # GraphQL endpoints
|
|
"*.json" # JSON files
|
|
],
|
|
match_mode=MatchMode.OR
|
|
)
|
|
test_urls = [
|
|
("https://example.com/api/v1/users", True),
|
|
("https://example.com/api/v2/posts", True),
|
|
("https://example.com/graphql", True),
|
|
("https://example.com/data.json", True),
|
|
("https://example.com/api/users", False), # No version
|
|
]
|
|
for url, expected in test_urls:
|
|
result = config.is_match(url)
|
|
status = "✓" if result == expected else "✗"
|
|
print(f" {status} {url} -> {result}")
|
|
|
|
# Test 6: Edge cases
|
|
print("\n6. Edge cases")
|
|
|
|
# No matcher
|
|
config = CrawlerRunConfig()
|
|
result = config.is_match("https://example.com")
|
|
print(f" {'✓' if not result else '✗'} No matcher -> {result}")
|
|
|
|
# Empty list
|
|
config = CrawlerRunConfig(url_matcher=[])
|
|
result = config.is_match("https://example.com")
|
|
print(f" {'✓' if not result else '✗'} Empty list -> {result}")
|
|
|
|
# None in list (should be skipped)
|
|
config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])
|
|
result = config.is_match("test.pdf")
|
|
print(f" {'✓' if result else '✗'} List with None -> {result}")
|
|
|
|
print("\n" + "=" * 50)
|
|
print("All matching tests completed!")
|
|
|
|
if __name__ == "__main__":
|
|
test_all_matching_scenarios() |