feat(crawler): optimize single URL handling and add performance comparison
- Add special handling for single URL requests in the Docker API to use arun() instead of arun_many()
- Add new example script demonstrating performance differences between sequential and parallel crawling
- Update cache mode from aggressive to bypass in examples and tests
- Remove unused dependencies (zstandard, msgpack)

BREAKING CHANGE: Changed default cache_mode from aggressive to bypass in examples
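For context, a minimal client-side sketch of how the single-URL fast path might be exercised against the Docker API. This is an assumption-laden illustration, not part of the commit: the endpoint path, host, and port are guesses; only the payload shape and the "success" field appear in the diff below.

# Hypothetical client call; endpoint path, host, and port are assumptions.
import requests

payload = {
    "urls": ["https://example.com"],  # a single URL now routes to crawler.arun() rather than arun_many()
    "browser_config": {"headless": True},
    "crawler_config": {"cache_mode": "bypass"},
}
resp = requests.post("http://localhost:11235/crawl", json=payload)
print(resp.json()["success"])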
@@ -554,7 +554,7 @@ async def test_stream_crawl(session, token: str):
             "https://example.com/page3",
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }

     # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
@@ -388,11 +388,19 @@ async def handle_crawl_request(
     )

     async with AsyncWebCrawler(config=browser_config) as crawler:
-        results = await crawler.arun_many(
-            urls=urls,
-            config=crawler_config,
-            dispatcher=dispatcher
-        )
+        results = []
+        if len(urls) == 1:
+            results = await crawler.arun(
+                url=urls[0],
+                config=crawler_config,
+                dispatcher=dispatcher
+            )
+        else:
+            results = await crawler.arun_many(
+                urls=urls,
+                config=crawler_config,
+                dispatcher=dispatcher
+            )

         return {
             "success": True,
docs/examples/arun_vs_arun_many.py (new file, 79 lines)
@@ -0,0 +1,79 @@
+import asyncio
+import time
+from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
+
+VERBOSE = False
+
+async def crawl_sequential(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    results = []
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        for url in urls:
+            result_container = await crawler.arun(url=url, config=config)
+            results.append(result_container[0])
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_dispatcher(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with rate limiter enabled (default behavior)
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
+        max_session_permit=50,
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def crawl_parallel_no_rate_limit(urls):
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
+    # Dispatcher with no rate limiter and a high session permit to avoid queuing
+    dispatcher = MemoryAdaptiveDispatcher(
+        rate_limiter=None,
+        max_session_permit=len(urls)  # allow all URLs concurrently
+    )
+    start_time = time.perf_counter()
+    async with AsyncWebCrawler() as crawler:
+        result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
+        results = []
+        if isinstance(result_container, list):
+            results = result_container
+        else:
+            async for res in result_container:
+                results.append(res)
+    total_time = time.perf_counter() - start_time
+    return total_time, results
+
+async def main():
+    urls = ["https://example.com"] * 100
+    print(f"Crawling {len(urls)} URLs sequentially...")
+    seq_time, seq_results = await crawl_sequential(urls)
+    print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
+    disp_time, disp_results = await crawl_parallel_dispatcher(urls)
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
+
+    print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
+    no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
+
+    print("Crawl4ai - Crawling Comparison")
+    print("--------------------------------------------------------")
+    print(f"Sequential crawling took: {seq_time:.2f} seconds")
+    print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
+    print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")
@@ -43,8 +43,6 @@ dependencies = [
    "faust-cchardet>=2.1.19",
    "aiohttp>=3.11.11",
    "humanize>=4.10.0",
-    "zstandard>=0.23.0",
-    "msgpack>=1.1.0"
 ]
 classifiers = [
    "Development Status :: 4 - Beta",
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
             # "https://news.ycombinator.com/news"
         ],
         "browser_config": {"headless": True, "viewport": {"width": 1200}},
-        "crawler_config": {"stream": True, "cache_mode": "aggressive"}
+        "crawler_config": {"stream": True, "cache_mode": "bypass"}
     }
     headers = {"Authorization": f"Bearer {token}"}
     print(f"\nTesting Streaming Crawl: {url}")