feat(crawler): optimize single URL handling and add performance comparison
- Add special handling for single URL requests in the Docker API to use arun() instead of arun_many()
- Add new example script demonstrating performance differences between sequential and parallel crawling
- Update cache mode from aggressive to bypass in examples and tests
- Remove unused dependencies (zstandard, msgpack)

BREAKING CHANGE: Changed default cache_mode from aggressive to bypass in examples
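For context, a minimal client-side sketch of how the single-URL fast path might be exercised against the Docker API. This is an assumption-laden illustration, not part of the commit: the endpoint path, host, and port are guesses; only the payload shape and the "success" field appear in the diff below.

# Hypothetical client call; endpoint path, host, and port are assumptions.
import requests

payload = {
    "urls": ["https://example.com"],  # a single URL now routes to crawler.arun() rather than arun_many()
    "browser_config": {"headless": True},
    "crawler_config": {"cache_mode": "bypass"},
}
resp = requests.post("http://localhost:11235/crawl", json=payload)
print(resp.json()["success"])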
@@ -554,7 +554,7 @@ async def test_stream_crawl(session, token: str):
             "https://example.com/page3",
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }

     # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
@@ -388,11 +388,19 @@ async def handle_crawl_request(
     )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        results = await crawler.arun_many(
-            urls=urls,
-            config=crawler_config,
-            dispatcher=dispatcher
-        )
+        results = []
+        if len(urls) == 1:
+            results = await crawler.arun(
+                url=urls[0],
+                config=crawler_config,
+                dispatcher=dispatcher
+            )
+        else:
+            results = await crawler.arun_many(
+                urls=urls,
+                config=crawler_config,
+                dispatcher=dispatcher
+            )

         return {
             "success": True,
docs/examples/arun_vs_arun_many.py (new file, 79 lines)
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
+
+VERBOSE = False
+
+async def crawl_sequential(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    results = []
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        for url in urls:
+            result_container = await crawler.arun(url=url, config=config)
+            results.append(result_container[0])
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_dispatcher(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with rate limiter enabled (default behavior)
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
+        max_session_permit=50,
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_no_rate_limit(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with no rate limiter and a high session permit to avoid queuing
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=None,
+        max_session_permit=len(urls)  # allow all URLs concurrently
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def main():
+    urls = ["https://example.com"] * 100
+    print(f"Crawling {len(urls)} URLs sequentially...")
+    seq_time, seq_results = await crawl_sequential(urls)
+    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
+    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
+    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
+
+    print("Crawl4ai - Crawling Comparison")
+    print("--------------------------------------------------------")
+    print(f"Sequential crawling took: {seq_time:.2f} seconds")
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")
@@ -43,8 +43,6 @@ dependencies = [
    "faust-cchardet>=2.1.19",
    "aiohttp>=3.11.11",
    "humanize>=4.10.0",
-    "zstandard>=0.23.0",
-    "msgpack>=1.1.0"
 ]
 classifiers = [
    "Development Status :: 4 - Beta",
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")