Apply Ruff Corrections

UncleCode
2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
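
The reformatting below is consistent with Ruff's autofixes and formatter: trailing commas added, long calls re-wrapped onto multiple lines, and star imports replaced with explicit names. A minimal sketch of the commands that produce this kind of change (the exact invocation is not recorded in the commit, and Ruff does not rewrite star imports automatically, so the explicit import lists were presumably written by hand, guided by ruff check's F403/F405 warnings):

    pip install ruff
    ruff check --fix .   # safe lint autofixes (e.g. unused imports)
    ruff format .        # trailing commas and call re-wrapping, as seen below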


@@ -1,14 +1,19 @@
from typing import Dict, Optional, List
from .async_configs import *
from .models import *
from typing import Dict, Optional, List, Tuple
from .async_configs import CrawlerRunConfig
from .models import (
CrawlResult,
CrawlerTaskResult,
CrawlStatus,
DisplayMode,
CrawlStats,
DomainState,
)
from rich.live import Live
from rich.table import Table
from rich.console import Console
from rich.style import Style
from rich import box
from datetime import datetime, timedelta
from dataclasses import dataclass
import time
import psutil
@@ -26,63 +31,66 @@ class RateLimiter:
base_delay: Tuple[float, float] = (1.0, 3.0),
max_delay: float = 60.0,
max_retries: int = 3,
rate_limit_codes: List[int] = None
rate_limit_codes: List[int] = None,
):
self.base_delay = base_delay
self.max_delay = max_delay
self.max_retries = max_retries
self.rate_limit_codes = rate_limit_codes or [429, 503]
self.domains: Dict[str, DomainState] = {}
def get_domain(self, url: str) -> str:
return urlparse(url).netloc
async def wait_if_needed(self, url: str) -> None:
domain = self.get_domain(url)
state = self.domains.get(domain)
if not state:
self.domains[domain] = DomainState()
state = self.domains[domain]
now = time.time()
if state.last_request_time:
wait_time = max(0, state.current_delay - (now - state.last_request_time))
if wait_time > 0:
await asyncio.sleep(wait_time)
# Random delay within base range if no current delay
if state.current_delay == 0:
state.current_delay = random.uniform(*self.base_delay)
state.last_request_time = time.time()
def update_delay(self, url: str, status_code: int) -> bool:
domain = self.get_domain(url)
state = self.domains[domain]
if status_code in self.rate_limit_codes:
state.fail_count += 1
if state.fail_count > self.max_retries:
return False
# Exponential backoff with random jitter
state.current_delay = min(
state.current_delay * 2 * random.uniform(0.75, 1.25),
self.max_delay
state.current_delay * 2 * random.uniform(0.75, 1.25), self.max_delay
)
else:
# Gradually reduce delay on success
state.current_delay = max(
random.uniform(*self.base_delay),
state.current_delay * 0.75
random.uniform(*self.base_delay), state.current_delay * 0.75
)
state.fail_count = 0
return True
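# Example (editor's sketch, not part of this commit): exercising the RateLimiter
# above with only names visible in this diff. A 429/503 response doubles the
# per-domain delay with +/-25% jitter (capped at max_delay); any other status
# decays it back toward base_delay, and update_delay returns False once
# max_retries consecutive rate-limit responses have been seen.
import asyncio

async def fetch_politely(limiter: RateLimiter, url: str) -> None:
    await limiter.wait_if_needed(url)  # sleeps out any pending per-domain delay
    status_code = 429  # placeholder status; a real fetch would supply its own
    if not limiter.update_delay(url, status_code):  # False once retries are exhausted
        raise RuntimeError(f"retry budget exhausted for {url}")

asyncio.run(fetch_politely(RateLimiter(max_retries=3), "https://example.com/a"))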
class CrawlerMonitor:
def __init__(self, max_visible_rows: int = 15, display_mode: DisplayMode = DisplayMode.DETAILED):
def __init__(
self,
max_visible_rows: int = 15,
display_mode: DisplayMode = DisplayMode.DETAILED,
):
self.console = Console()
self.max_visible_rows = max_visible_rows
self.display_mode = display_mode
@@ -90,23 +98,25 @@ class CrawlerMonitor:
self.process = psutil.Process()
self.start_time = datetime.now()
self.live = Live(self._create_table(), refresh_per_second=2)
def start(self):
self.live.start()
def stop(self):
self.live.stop()
def add_task(self, task_id: str, url: str):
self.stats[task_id] = CrawlStats(task_id=task_id, url=url, status=CrawlStatus.QUEUED)
self.stats[task_id] = CrawlStats(
task_id=task_id, url=url, status=CrawlStatus.QUEUED
)
self.live.update(self._create_table())
def update_task(self, task_id: str, **kwargs):
if task_id in self.stats:
for key, value in kwargs.items():
setattr(self.stats[task_id], key, value)
self.live.update(self._create_table())
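# Example (editor's sketch, not part of this commit): the monitor lifecycle as
# exposed by the methods above. start()/stop() bracket a rich Live display;
# add_task registers a row and update_task mutates any CrawlStats field.
monitor = CrawlerMonitor(max_visible_rows=10)
monitor.start()
monitor.add_task("task-0001", "https://example.com")
monitor.update_task("task-0001", status=CrawlStatus.IN_PROGRESS)
monitor.update_task("task-0001", status=CrawlStatus.COMPLETED, memory_usage=12.5)
monitor.stop()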
def _create_aggregated_table(self) -> Table:
"""Creates a compact table showing only aggregated statistics"""
table = Table(
@@ -114,78 +124,78 @@ class CrawlerMonitor:
title="Crawler Status Overview",
title_style="bold magenta",
header_style="bold blue",
show_lines=True
show_lines=True,
)
# Calculate statistics
total_tasks = len(self.stats)
queued = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED)
in_progress = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS)
completed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED)
failed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED)
queued = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
)
in_progress = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
)
completed = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
)
failed = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
)
# Memory statistics
current_memory = self.process.memory_info().rss / (1024 * 1024)
total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
peak_memory = max((stat.peak_memory for stat in self.stats.values()), default=0.0)
peak_memory = max(
(stat.peak_memory for stat in self.stats.values()), default=0.0
)
# Duration
duration = datetime.now() - self.start_time
# Create status row
table.add_column("Status", style="bold cyan")
table.add_column("Count", justify="right")
table.add_column("Percentage", justify="right")
table.add_row(
"Total Tasks",
str(total_tasks),
"100%"
)
table.add_row("Total Tasks", str(total_tasks), "100%")
table.add_row(
"[yellow]In Queue[/yellow]",
str(queued),
f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[blue]In Progress[/blue]",
str(in_progress),
f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[green]Completed[/green]",
str(completed),
f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
)
table.add_row(
"[red]Failed[/red]",
str(failed),
f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%",
)
# Add memory information
table.add_section()
table.add_row(
"[magenta]Current Memory[/magenta]",
f"{current_memory:.1f} MB",
""
"[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
)
table.add_row(
"[magenta]Total Task Memory[/magenta]",
f"{total_task_memory:.1f} MB",
""
"[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
)
table.add_row(
"[magenta]Peak Task Memory[/magenta]",
f"{peak_memory:.1f} MB",
""
"[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
)
table.add_row(
"[yellow]Runtime[/yellow]",
str(timedelta(seconds=int(duration.total_seconds()))),
""
"",
)
return table
def _create_detailed_table(self) -> Table:
@@ -193,9 +203,9 @@ class CrawlerMonitor:
box=box.ROUNDED,
title="Crawler Performance Monitor",
title_style="bold magenta",
header_style="bold blue"
header_style="bold blue",
)
# Add columns
table.add_column("Task ID", style="cyan", no_wrap=True)
table.add_column("URL", style="cyan", no_wrap=True)
@@ -204,47 +214,54 @@ class CrawlerMonitor:
table.add_column("Peak (MB)", justify="right")
table.add_column("Duration", justify="right")
table.add_column("Info", style="italic")
# Add summary row
total_memory = sum(stat.memory_usage for stat in self.stats.values())
active_count = sum(1 for stat in self.stats.values()
if stat.status == CrawlStatus.IN_PROGRESS)
completed_count = sum(1 for stat in self.stats.values()
if stat.status == CrawlStatus.COMPLETED)
failed_count = sum(1 for stat in self.stats.values()
if stat.status == CrawlStatus.FAILED)
active_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
)
completed_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
)
failed_count = sum(
1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
)
table.add_row(
"[bold yellow]SUMMARY",
f"Total: {len(self.stats)}",
f"Active: {active_count}",
f"{total_memory:.1f}",
f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
str(timedelta(seconds=int((datetime.now() - self.start_time).total_seconds()))),
str(
timedelta(
seconds=int((datetime.now() - self.start_time).total_seconds())
)
),
f"{completed_count}{failed_count}",
style="bold"
style="bold",
)
table.add_section()
# Add rows for each task
visible_stats = sorted(
self.stats.values(),
key=lambda x: (
x.status != CrawlStatus.IN_PROGRESS,
x.status != CrawlStatus.QUEUED,
x.end_time or datetime.max
)
)[:self.max_visible_rows]
x.end_time or datetime.max,
),
)[: self.max_visible_rows]
for stat in visible_stats:
status_style = {
CrawlStatus.QUEUED: "white",
CrawlStatus.IN_PROGRESS: "yellow",
CrawlStatus.COMPLETED: "green",
CrawlStatus.FAILED: "red"
CrawlStatus.FAILED: "red",
}[stat.status]
table.add_row(
stat.task_id[:8], # Show first 8 chars of task ID
stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
@@ -252,9 +269,9 @@ class CrawlerMonitor:
f"{stat.memory_usage:.1f}",
f"{stat.peak_memory:.1f}",
stat.duration,
stat.error_message[:40] if stat.error_message else ""
stat.error_message[:40] if stat.error_message else "",
)
return table
def _create_table(self) -> Table:
@@ -268,7 +285,7 @@ class BaseDispatcher(ABC):
def __init__(
self,
rate_limiter: Optional[RateLimiter] = None,
monitor: Optional[CrawlerMonitor] = None
monitor: Optional[CrawlerMonitor] = None,
):
self.crawler = None
self._domain_last_hit: Dict[str, float] = {}
@@ -278,24 +295,25 @@ class BaseDispatcher(ABC):
@abstractmethod
async def crawl_url(
self,
url: str,
config: CrawlerRunConfig,
self,
url: str,
config: CrawlerRunConfig,
task_id: str,
monitor: Optional[CrawlerMonitor] = None
monitor: Optional[CrawlerMonitor] = None,
) -> CrawlerTaskResult:
pass
@abstractmethod
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler",
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
config: CrawlerRunConfig,
monitor: Optional[CrawlerMonitor] = None
monitor: Optional[CrawlerMonitor] = None,
) -> List[CrawlerTaskResult]:
pass
class MemoryAdaptiveDispatcher(BaseDispatcher):
def __init__(
self,
@@ -304,39 +322,41 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
max_session_permit: int = 20,
memory_wait_timeout: float = 300.0, # 5 minutes default timeout
rate_limiter: Optional[RateLimiter] = None,
monitor: Optional[CrawlerMonitor] = None
monitor: Optional[CrawlerMonitor] = None,
):
super().__init__(rate_limiter, monitor)
self.memory_threshold_percent = memory_threshold_percent
self.check_interval = check_interval
self.max_session_permit = max_session_permit
self.memory_wait_timeout = memory_wait_timeout
async def crawl_url(
self,
url: str,
config: CrawlerRunConfig,
self,
url: str,
config: CrawlerRunConfig,
task_id: str,
) -> CrawlerTaskResult:
start_time = datetime.now()
error_message = ""
memory_usage = peak_memory = 0.0
try:
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
self.monitor.update_task(
task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
)
self.concurrent_sessions += 1
if self.rate_limiter:
await self.rate_limiter.wait_if_needed(url)
process = psutil.Process()
start_memory = process.memory_info().rss / (1024 * 1024)
result = await self.crawler.arun(url, config=config, session_id=task_id)
end_memory = process.memory_info().rss / (1024 * 1024)
memory_usage = peak_memory = end_memory - start_memory
if self.rate_limiter and result.status_code:
if not self.rate_limiter.update_delay(url, result.status_code):
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
@@ -350,22 +370,24 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
peak_memory=peak_memory,
start_time=start_time,
end_time=datetime.now(),
error_message=error_message
error_message=error_message,
)
if not result.success:
error_message = result.error_message
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
elif self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
except Exception as e:
error_message = str(e)
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
result = CrawlResult(
url=url, html="", metadata={}, success=False, error_message=str(e)
)
finally:
end_time = datetime.now()
if self.monitor:
@@ -374,10 +396,10 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
end_time=end_time,
memory_usage=memory_usage,
peak_memory=peak_memory,
error_message=error_message
error_message=error_message,
)
self.concurrent_sessions -= 1
return CrawlerTaskResult(
task_id=task_id,
url=url,
@@ -386,20 +408,20 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
peak_memory=peak_memory,
start_time=start_time,
end_time=end_time,
error_message=error_message
error_message=error_message,
)
async def run_urls(
self,
urls: List[str],
crawler: "AsyncWebCrawler",
self,
urls: List[str],
crawler: "AsyncWebCrawler", # noqa: F821
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
self.crawler = crawler
if self.monitor:
self.monitor.start()
try:
pending_tasks = []
active_tasks = []
@@ -417,23 +439,24 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
# Check if we've exceeded the timeout
if time.time() - wait_start_time > self.memory_wait_timeout:
raise MemoryError(f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds")
raise MemoryError(
f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
)
await asyncio.sleep(self.check_interval)
continue
url, task_id = task_queue.pop(0)
task = asyncio.create_task(self.crawl_url(url, config, task_id))
active_tasks.append(task)
if not active_tasks:
await asyncio.sleep(self.check_interval)
continue
done, pending = await asyncio.wait(
active_tasks,
return_when=asyncio.FIRST_COMPLETED
active_tasks, return_when=asyncio.FIRST_COMPLETED
)
pending_tasks.extend(done)
active_tasks = list(pending)
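# Example (editor's sketch, not part of this commit): the memory-gate pattern
# used by the loop above, in isolation. Dispatching pauses while system memory
# sits at or above the threshold, and a MemoryError is raised if it never
# recovers within the timeout.
import asyncio
import time
import psutil

async def wait_for_memory_headroom(
    threshold_percent: float = 90.0,
    check_interval: float = 1.0,
    timeout: float = 300.0,
) -> None:
    start = time.time()
    while psutil.virtual_memory().percent >= threshold_percent:
        if time.time() - start > timeout:
            raise MemoryError(
                f"Memory usage above threshold ({threshold_percent}%) "
                f"for more than {timeout} seconds"
            )
        await asyncio.sleep(check_interval)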
@@ -442,24 +465,25 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
if self.monitor:
self.monitor.stop()
class SemaphoreDispatcher(BaseDispatcher):
def __init__(
self,
semaphore_count: int = 5,
max_session_permit: int = 20,
rate_limiter: Optional[RateLimiter] = None,
monitor: Optional[CrawlerMonitor] = None
monitor: Optional[CrawlerMonitor] = None,
):
super().__init__(rate_limiter, monitor)
self.semaphore_count = semaphore_count
self.max_session_permit = max_session_permit
self.max_session_permit = max_session_permit
async def crawl_url(
self,
url: str,
config: CrawlerRunConfig,
self,
url: str,
config: CrawlerRunConfig,
task_id: str,
semaphore: asyncio.Semaphore = None
semaphore: asyncio.Semaphore = None,
) -> CrawlerTaskResult:
start_time = datetime.now()
error_message = ""
@@ -467,7 +491,9 @@ class SemaphoreDispatcher(BaseDispatcher):
try:
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
self.monitor.update_task(
task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
)
if self.rate_limiter:
await self.rate_limiter.wait_if_needed(url)
@@ -477,7 +503,7 @@ class SemaphoreDispatcher(BaseDispatcher):
start_memory = process.memory_info().rss / (1024 * 1024)
result = await self.crawler.arun(url, config=config, session_id=task_id)
end_memory = process.memory_info().rss / (1024 * 1024)
memory_usage = peak_memory = end_memory - start_memory
if self.rate_limiter and result.status_code:
@@ -493,7 +519,7 @@ class SemaphoreDispatcher(BaseDispatcher):
peak_memory=peak_memory,
start_time=start_time,
end_time=datetime.now(),
error_message=error_message
error_message=error_message,
)
if not result.success:
@@ -507,7 +533,9 @@ class SemaphoreDispatcher(BaseDispatcher):
error_message = str(e)
if self.monitor:
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
result = CrawlResult(
url=url, html="", metadata={}, success=False, error_message=str(e)
)
finally:
end_time = datetime.now()
@@ -517,7 +545,7 @@ class SemaphoreDispatcher(BaseDispatcher):
end_time=end_time,
memory_usage=memory_usage,
peak_memory=peak_memory,
error_message=error_message
error_message=error_message,
)
return CrawlerTaskResult(
@@ -528,13 +556,13 @@ class SemaphoreDispatcher(BaseDispatcher):
peak_memory=peak_memory,
start_time=start_time,
end_time=end_time,
error_message=error_message
error_message=error_message,
)
async def run_urls(
self,
crawler: "AsyncWebCrawler",
urls: List[str],
self,
crawler: "AsyncWebCrawler", # noqa: F821
urls: List[str],
config: CrawlerRunConfig,
) -> List[CrawlerTaskResult]:
self.crawler = crawler
@@ -557,4 +585,4 @@ class SemaphoreDispatcher(BaseDispatcher):
return await asyncio.gather(*tasks, return_exceptions=True)
finally:
if self.monitor:
self.monitor.stop()
self.monitor.stop()
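# Example (editor's sketch, not part of this commit): wiring a dispatcher to
# crawl4ai's AsyncWebCrawler. Constructor defaults are assumed; note that in
# this diff MemoryAdaptiveDispatcher.run_urls takes (urls, crawler, config)
# while SemaphoreDispatcher.run_urls takes (crawler, urls, config).
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main() -> None:
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=RateLimiter(),
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED),
    )
    async with AsyncWebCrawler() as crawler:
        results = await dispatcher.run_urls(
            ["https://example.com"], crawler, CrawlerRunConfig()
        )
    for r in results:
        print(r.task_id, r.url, r.error_message or "ok")

asyncio.run(main())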