feat(crawler): optimize single URL handling and add performance comparison

Add special handling for single URL requests in Docker API to use arun() instead of arun_many().
Add new example script demonstrating performance differences between sequential and parallel crawling.
Update cache mode from aggressive to bypass in examples and tests.
Remove unused dependencies (zstandard, msgpack).

BREAKING CHANGE: Changed default cache_mode from aggressive to bypass in examples
2025-03-13 22:15:15 +08:00
parent dc36997a08
commit b750542e6d
6 changed files with 95 additions and 10 deletions
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -554,7 +554,7 @@ async def test_stream_crawl(session, token: str):
            "https://example.com/page3",  
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
    }

    # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -388,11 +388,19 @@ async def handle_crawl_request(
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
-            results = await crawler.arun_many(
-                urls=urls,
-                config=crawler_config,
-                dispatcher=dispatcher
-            )
+            results = []
+            if len(urls) == 1:
+                results = await crawler.arun(
+                    url=urls[0],
+                    config=crawler_config,
+                    dispatcher=dispatcher
+                )
+            else:    
+                results = await crawler.arun_many(
+                    urls=urls,
+                    config=crawler_config,
+                    dispatcher=dispatcher
+                )
            
            return {
                "success": True,
--- a/docs/examples/arun_vs_arun_many.py
+++ b/docs/examples/arun_vs_arun_many.py
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
+
+VERBOSE = False
+
+async def crawl_sequential(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    results = []
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        for url in urls:
+            result_container = await crawler.arun(url=url, config=config)
+            results.append(result_container[0])
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_dispatcher(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with rate limiter enabled (default behavior)
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
+        max_session_permit=50,
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_no_rate_limit(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with no rate limiter and a high session permit to avoid queuing
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=None,
+        max_session_permit=len(urls)  # allow all URLs concurrently
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def main():
+    urls = ["https://example.com"] * 100
+    print(f"Crawling {len(urls)} URLs sequentially...")
+    seq_time, seq_results = await crawl_sequential(urls)
+    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
+    
+    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
+    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
+       
+    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
+    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
+    
+    print("Crawl4ai - Crawling Comparison")
+    print("--------------------------------------------------------")
+    print(f"Sequential crawling took: {seq_time:.2f} seconds")
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
+    
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/docker_python_rest_api.py
+++ b/docs/examples/docker_python_rest_api.py
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
            # "https://news.ycombinator.com/news"
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
    }
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting Streaming Crawl: {url}")
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,8 +43,6 @@ dependencies = [
    "faust-cchardet>=2.1.19",
    "aiohttp>=3.11.11",
    "humanize>=4.10.0",
-    "zstandard>=0.23.0",
-    "msgpack>=1.1.0"
 ]
 classifiers = [
    "Development Status :: 4 - Beta",
--- a/tests/docker/test_server_token.py
+++ b/tests/docker/test_server_token.py
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
            # "https://news.ycombinator.com/news"
        ],
        "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
    }
    headers = {"Authorization": f"Bearer {token}"}
    print(f"\nTesting Streaming Crawl: {url}")