Implement dynamic configuration selection based on URL patterns to optimize crawling for different content types. This feature enables users to apply different crawling strategies (PDF extraction, content filtering, JavaScript execution) based on URL matching patterns.

Key additions:
- Add url_matcher and match_mode parameters to CrawlerRunConfig
- Implement is_match() method supporting string patterns, functions, and mixed lists
- Add MatchMode enum for OR/AND logic when combining multiple matchers
- Update AsyncWebCrawler.arun_many() to accept List[CrawlerRunConfig]
- Add select_config() method to dispatchers for runtime config selection
- First matching config wins, with fallback to the default

Pattern matching supports:
- Glob-style strings: *.pdf, */blog/*, *api*
- Lambda functions: lambda url: 'github.com' in url
- Mixed patterns with AND/OR logic for complex matching

This enables optimal per-URL configuration:
- PDFs: Use PDFContentScrapingStrategy without JavaScript
- Blogs: Apply content filtering to reduce noise
- APIs: Skip JavaScript, use JSON extraction
- Dynamic sites: Execute only necessary JavaScript

Breaking changes: None - fully backward compatible
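Before the full test script, here is a minimal sketch of the matching semantics described above. It is illustrative, not the library's implementation: it assumes glob patterns follow Python's fnmatch semantics, that callables receive the URL and return a bool, and that an absent matcher never matches (so such a config only applies as an explicit fallback). The helper name url_matches is hypothetical.

from fnmatch import fnmatch
from typing import Callable, List, Union

Matcher = Union[str, Callable[[str], bool]]

def url_matches(url: str, matcher: Union[Matcher, List[Matcher], None],
                and_logic: bool = False) -> bool:
    """Sketch of is_match()-style logic: globs, callables, or mixed lists."""
    if matcher is None:
        return False  # no matcher: config can only serve as an explicit fallback
    matchers = matcher if isinstance(matcher, list) else [matcher]
    results = (m(url) if callable(m) else fnmatch(url, m) for m in matchers)
    return all(results) if and_logic else any(results)

# Glob, callable, and mixed AND-logic examples:
assert url_matches("https://example.com/report.pdf", "*.pdf")
assert url_matches("https://api.github.com/users/github", lambda u: "api" in u)
assert url_matches("https://example.com/spec.docx",
                   ["*.doc*", lambda u: u.startswith("https://")], and_logic=True)

The test script below exercises the real API end to end.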
"""
|
|
Test example for multiple crawler configs feature
|
|
"""
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
|
|
|
async def test_multi_config():
|
|
# Create different configs for different URL patterns
|
|
|
|
# Config for PDF files
|
|
pdf_config = CrawlerRunConfig(
|
|
url_matcher="*.pdf",
|
|
)
|
|
|
|
# Config for articles (using multiple patterns with OR logic)
|
|
article_config = CrawlerRunConfig(
|
|
url_matcher=["*/news/*", "*blog*", "*/article/*"],
|
|
match_mode=MatchMode.OR,
|
|
screenshot=True,
|
|
)
|
|
|
|
# Config using custom matcher function
|
|
api_config = CrawlerRunConfig(
|
|
url_matcher=lambda url: 'api' in url or 'json' in url,
|
|
)
|
|
|
|
# Config combining patterns and functions with AND logic
|
|
secure_docs_config = CrawlerRunConfig(
|
|
url_matcher=[
|
|
"*.doc*", # Matches .doc, .docx
|
|
lambda url: url.startswith('https://') # Must be HTTPS
|
|
],
|
|
match_mode=MatchMode.AND,
|
|
)
|
|
|
|
# Default config (no url_matcher means it won't match anything unless it's the fallback)
|
|
default_config = CrawlerRunConfig(
|
|
# cache_mode=CacheMode.BYPASS,
|
|
)
|
|
|
|
# List of configs - order matters! First match wins
|
|
configs = [
|
|
pdf_config,
|
|
article_config,
|
|
api_config,
|
|
secure_docs_config,
|
|
default_config # Fallback
|
|
]
|
|
|
|
# Test URLs - using real URLs that exist
|
|
test_urls = [
|
|
# "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF
|
|
# "https://www.bbc.com/news/articles/c5y3e3glnldo", # News article
|
|
# "https://blog.python.org/", # Blog URL
|
|
# "https://api.github.com/users/github", # GitHub API (returns JSON)
|
|
# "https://httpbin.org/json", # API endpoint that returns JSON
|
|
# "https://www.python.org/", # Generic HTTPS page
|
|
# "http://info.cern.ch/", # HTTP (not HTTPS) page
|
|
"https://example.com/", # → Default config
|
|
]
|
|
|
|
# Test the matching logic
|
|
print("Config matching test:")
|
|
print("-" * 50)
|
|
for url in test_urls:
|
|
for i, config in enumerate(configs):
|
|
if config.is_match(url):
|
|
print(f"{url} -> Config {i} matches")
|
|
break
|
|
else:
|
|
print(f"{url} -> No match, will use fallback (first config)")
|
|
|
|
print("\n" + "=" * 50 + "\n")
|
|
|
|
# Now test with actual crawler
|
|
async with AsyncWebCrawler() as crawler:
|
|
# Single config - traditional usage still works
|
|
print("Test 1: Single config (backwards compatible)")
|
|
result = await crawler.arun_many(
|
|
urls=["https://www.python.org/"],
|
|
config=default_config
|
|
)
|
|
print(f"Crawled {len(result)} URLs with single config\n")
|
|
|
|
# Multiple configs - new feature
|
|
print("Test 2: Multiple configs")
|
|
# Just test with 2 URLs to avoid timeout
|
|
results = await crawler.arun_many(
|
|
urls=test_urls[:2], # Just test first 2 URLs
|
|
config=configs # Pass list of configs
|
|
)
|
|
print(f"Crawled {len(results)} URLs with multiple configs")
|
|
|
|
# Using custom matcher inline
|
|
print("\nTest 3: Inline custom matcher")
|
|
custom_config = CrawlerRunConfig(
|
|
url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
|
|
verbose=False
|
|
)
|
|
results = await crawler.arun_many(
|
|
urls=[
|
|
"https://docs.python.org/3/library/asyncio.html", # Long URL with 'python'
|
|
"https://python.org/", # Short URL with 'python' - won't match
|
|
"https://www.google.com/" # No 'python' - won't match
|
|
],
|
|
config=[custom_config, default_config]
|
|
)
|
|
print(f"Crawled {len(results)} URLs with custom matcher")
|
|
|
|
if __name__ == "__main__":
|
|
asyncio.run(test_multi_config()) |
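The script never calls select_config() directly, but the runtime selection that the description attributes to dispatchers reduces to a first-match scan over the config list. The sketch below is an illustrative reconstruction, not the dispatcher's actual code; the fallback rule (prefer a config that declares no url_matcher) is inferred from the test's own comments, and the function name is hypothetical.

from typing import List, Optional

def select_config_sketch(url: str,
                         configs: List["CrawlerRunConfig"]) -> Optional["CrawlerRunConfig"]:
    """Illustrative first-match-wins selection, as a dispatcher might apply it."""
    for config in configs:
        if config.is_match(url):  # glob, callable, or mixed-list matching
            return config
    # Nothing matched: fall back to a config that declares no url_matcher
    for config in configs:
        if getattr(config, "url_matcher", None) is None:
            return config
    return None

Because the scan stops at the first hit, order in the config list encodes priority: specific matchers (e.g. "*.pdf") should precede broad ones, with the matcher-less default last.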