Apply Ruff Corrections

This commit is contained in:
UncleCode
2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions

View File

@@ -3,11 +3,18 @@ import time
from rich import print
from rich.table import Table
from crawl4ai import (
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
MemoryAdaptiveDispatcher,
SemaphoreDispatcher,
RateLimiter,
CrawlerMonitor,
DisplayMode,
CacheMode,
)
async def memory_adaptive(urls, browser_config, run_config):
"""Memory adaptive crawler with monitoring"""
start = time.perf_counter()
@@ -16,14 +23,16 @@ async def memory_adaptive(urls, browser_config, run_config):
memory_threshold_percent=70.0,
max_session_permit=10,
monitor=CrawlerMonitor(
max_visible_rows=15,
display_mode=DisplayMode.DETAILED
)
max_visible_rows=15, display_mode=DisplayMode.DETAILED
),
)
results = await crawler.arun_many(
urls, config=run_config, dispatcher=dispatcher
)
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
duration = time.perf_counter() - start
return len(results), duration
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
"""Memory adaptive crawler with rate limiting"""
start = time.perf_counter()
@@ -32,19 +41,19 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
memory_threshold_percent=70.0,
max_session_permit=10,
rate_limiter=RateLimiter(
base_delay=(1.0, 2.0),
max_delay=30.0,
max_retries=2
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
),
monitor=CrawlerMonitor(
max_visible_rows=15,
display_mode=DisplayMode.DETAILED
)
max_visible_rows=15, display_mode=DisplayMode.DETAILED
),
)
results = await crawler.arun_many(
urls, config=run_config, dispatcher=dispatcher
)
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
duration = time.perf_counter() - start
return len(results), duration
async def semaphore(urls, browser_config, run_config):
"""Basic semaphore crawler"""
start = time.perf_counter()
@@ -52,14 +61,16 @@ async def semaphore(urls, browser_config, run_config):
dispatcher = SemaphoreDispatcher(
semaphore_count=5,
monitor=CrawlerMonitor(
max_visible_rows=15,
display_mode=DisplayMode.DETAILED
)
max_visible_rows=15, display_mode=DisplayMode.DETAILED
),
)
results = await crawler.arun_many(
urls, config=run_config, dispatcher=dispatcher
)
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
duration = time.perf_counter() - start
return len(results), duration
async def semaphore_with_rate_limit(urls, browser_config, run_config):
"""Semaphore crawler with rate limiting"""
start = time.perf_counter()
@@ -67,19 +78,19 @@ async def semaphore_with_rate_limit(urls, browser_config, run_config):
dispatcher = SemaphoreDispatcher(
semaphore_count=5,
rate_limiter=RateLimiter(
base_delay=(1.0, 2.0),
max_delay=30.0,
max_retries=2
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
),
monitor=CrawlerMonitor(
max_visible_rows=15,
display_mode=DisplayMode.DETAILED
)
max_visible_rows=15, display_mode=DisplayMode.DETAILED
),
)
results = await crawler.arun_many(
urls, config=run_config, dispatcher=dispatcher
)
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
duration = time.perf_counter() - start
return len(results), duration
def create_performance_table(results):
"""Creates a rich table showing performance results"""
table = Table(title="Crawler Strategy Performance Comparison")
@@ -89,18 +100,16 @@ def create_performance_table(results):
table.add_column("URLs/second", justify="right", style="magenta")
sorted_results = sorted(results.items(), key=lambda x: x[1][1])
for strategy, (urls_crawled, duration) in sorted_results:
urls_per_second = urls_crawled / duration
table.add_row(
strategy,
str(urls_crawled),
f"{duration:.2f}",
f"{urls_per_second:.2f}"
strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
)
return table
async def main():
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
browser_config = BrowserConfig(headless=True, verbose=False)
@@ -108,14 +117,19 @@ async def main():
results = {
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
"Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
"Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(
urls, browser_config, run_config
),
"Semaphore": await semaphore(urls, browser_config, run_config),
"Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
"Semaphore + Rate Limit": await semaphore_with_rate_limit(
urls, browser_config, run_config
),
}
table = create_performance_table(results)
print("\nPerformance Summary:")
print(table)
if __name__ == "__main__":
asyncio.run(main())
asyncio.run(main())