diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index c4582031..b4b6e414 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -554,7 +554,7 @@ async def test_stream_crawl(session, token: str):
             "https://example.com/page3",
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
 
     # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index cc103905..4c7e17d2 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -388,11 +388,19 @@ async def handle_crawl_request(
         )
 
         async with AsyncWebCrawler(config=browser_config) as crawler:
-            results = await crawler.arun_many(
-                urls=urls,
-                config=crawler_config,
-                dispatcher=dispatcher
-            )
+            results = []
+            if len(urls) == 1:
+                results = await crawler.arun(
+                    url=urls[0],
+                    config=crawler_config,
+                    dispatcher=dispatcher
+                )
+            else:
+                results = await crawler.arun_many(
+                    urls=urls,
+                    config=crawler_config,
+                    dispatcher=dispatcher
+                )
 
         return {
             "success": True,
diff --git a/docs/examples/arun_vs_arun_many.py b/docs/examples/arun_vs_arun_many.py
new file mode 100644
index 00000000..40bc4381
--- /dev/null
+++ b/docs/examples/arun_vs_arun_many.py
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
+
+VERBOSE = False
+
+async def crawl_sequential(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    results = []
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        for url in urls:
+            result_container = await crawler.arun(url=url, config=config)
+            results.append(result_container[0])
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_dispatcher(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with rate limiter enabled (default behavior)
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
+        max_session_permit=50,
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_no_rate_limit(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with no rate limiter and a high session permit to avoid queuing
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=None,
+        max_session_permit=len(urls)  # allow all URLs concurrently
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def main():
+    urls = ["https://example.com"] * 100
+    print(f"Crawling {len(urls)} URLs sequentially...")
+    seq_time, seq_results = await crawl_sequential(urls)
+    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
+    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
+    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
+
+    print("Crawl4ai - Crawling Comparison")
+    print("--------------------------------------------------------")
+    print(f"Sequential crawling took: {seq_time:.2f} seconds")
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/docs/examples/docker_python_rest_api.py b/docs/examples/docker_python_rest_api.py
index 000d6464..6650f8d5 100644
--- a/docs/examples/docker_python_rest_api.py
+++ b/docs/examples/docker_python_rest_api.py
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")
diff --git a/pyproject.toml b/pyproject.toml
index c3f03bfd..ad07548d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,8 +43,6 @@ dependencies = [
     "faust-cchardet>=2.1.19",
     "aiohttp>=3.11.11",
     "humanize>=4.10.0",
-    "zstandard>=0.23.0",
-    "msgpack>=1.1.0"
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
diff --git a/tests/docker/test_server_token.py b/tests/docker/test_server_token.py
index d8c7df89..220b6ca2 100644
--- a/tests/docker/test_server_token.py
+++ b/tests/docker/test_server_token.py
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")