crawl4ai/tests/20241401/test_stream.py
Commit 91463e34f1 by UncleCode: feat(config): add streaming support and config cloning
Add streaming capability to crawler configurations and introduce clone() methods
for both BrowserConfig and CrawlerRunConfig to support immutable config updates.
Move stream parameter from arun_many() method to CrawlerRunConfig.

BREAKING CHANGE: Removed stream parameter from AsyncWebCrawler.arun_many() method.
Use config.stream=True instead.
2025-01-19 17:51:47 +08:00
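
For a quick picture of the migration described in the commit message, here is a minimal sketch. The bare AsyncWebCrawler() setup and passing stream=True directly to the CrawlerRunConfig constructor are illustrative assumptions; the test file below uses clone(stream=True) on an existing config instead.

# Before this commit (removed): stream was a parameter of arun_many() itself.
# After this commit: streaming is opted into via the run config.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(stream=True)  # or existing_config.clone(stream=True)
    async with AsyncWebCrawler() as crawler:
        # With config.stream=True, arun_many() is iterated with async for.
        async for result in await crawler.arun_many(
            urls=["http://example.com", "http://example.org"],
            config=config,
        ):
            print(result.url, result.success)

if __name__ == "__main__":
    asyncio.run(main())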

50 lines | 1.6 KiB | Python

import os, sys
# append 2 parent directories to sys.path to import crawl4ai
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
import asyncio
from crawl4ai import *

async def test_crawler():
    # Setup configurations
    browser_config = BrowserConfig(headless=True, verbose=False)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )
    # Test URLs - mix of different sites
    urls = [
        "http://example.com",
        "http://example.org",
        "http://example.net",
    ] * 10  # 30 total URLs
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Streaming mode: with stream=True on the config, arun_many()
        # yields each result as soon as its crawl finishes.
        print("\n=== Testing Streaming Mode ===")
        async for result in await crawler.arun_many(
            urls=urls,
            config=crawler_config.clone(stream=True),
        ):
            print(f"Received result for: {result.url} - Success: {result.success}")

        # Batch mode: without stream, arun_many() returns the full list
        # of results once every URL has been processed.
        print("\n=== Testing Batch Mode ===")
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_config,
        )
        print(f"Received all {len(results)} results at once")
        for result in results:
            print(f"Batch result for: {result.url} - Success: {result.success}")

if __name__ == "__main__":
    asyncio.run(test_crawler())
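
The clone(stream=True) call in the test is the "immutable config update" the commit message refers to: it returns a modified copy and leaves the original configuration untouched. A small illustrative sketch, assuming stream defaults to False on CrawlerRunConfig and that clone() carries the other fields over:

from crawl4ai import CacheMode, CrawlerRunConfig

base = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
streaming = base.clone(stream=True)  # copy with streaming enabled

print(base.stream)       # expected: False, the original is unchanged
print(streaming.stream)  # expected: True
print(streaming.cache_mode == base.cache_mode)  # other settings expected to carry over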