refactor(dispatcher): migrate to modular dispatcher system with enhanced monitoring
Reorganize dispatcher functionality into separate components: - Create dedicated dispatcher classes (MemoryAdaptive, Semaphore) - Add RateLimiter for smart request throttling - Implement CrawlerMonitor for real-time progress tracking - Move dispatcher config from CrawlerRunConfig to separate classes BREAKING CHANGE: Dispatcher configuration moved from CrawlerRunConfig to dedicated dispatcher classes. Users need to update their configuration approach for multi-URL crawling.
This commit is contained in:
@@ -6,7 +6,8 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, Cosi
|
|||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
|
from .async_dispatcher import MemoryAdaptiveDispatcher, SemaphoreDispatcher, RateLimiter, CrawlerMonitor, DisplayMode
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -24,6 +25,12 @@ __all__ = [
|
|||||||
'DefaultMarkdownGenerator',
|
'DefaultMarkdownGenerator',
|
||||||
'PruningContentFilter',
|
'PruningContentFilter',
|
||||||
'BM25ContentFilter',
|
'BM25ContentFilter',
|
||||||
|
'MemoryAdaptiveDispatcher',
|
||||||
|
'SemaphoreDispatcher',
|
||||||
|
'RateLimiter',
|
||||||
|
'CrawlerMonitor',
|
||||||
|
'DisplayMode',
|
||||||
|
'MarkdownGenerationResult',
|
||||||
]
|
]
|
||||||
|
|
||||||
def is_sync_version_installed():
|
def is_sync_version_installed():
|
||||||
|
|||||||
@@ -11,8 +11,7 @@ from .user_agent_generator import UserAgentGenerator
|
|||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy
|
from .chunking_strategy import ChunkingStrategy
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
from typing import Union, List, Tuple, Optional
|
from typing import Union, List
|
||||||
from dataclasses import dataclass, field
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -184,14 +183,6 @@ class BrowserConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class RateLimitConfig:
|
|
||||||
base_delay: Tuple[float, float] = (1.0, 3.0)
|
|
||||||
max_delay: float = 60.0
|
|
||||||
max_retries: int = 3
|
|
||||||
rate_limit_codes: List[int] = field(default_factory=lambda: [429, 503])
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlerRunConfig:
|
class CrawlerRunConfig:
|
||||||
"""
|
"""
|
||||||
Configuration class for controlling how the crawler runs each crawl operation.
|
Configuration class for controlling how the crawler runs each crawl operation.
|
||||||
@@ -320,14 +311,8 @@ class CrawlerRunConfig:
|
|||||||
log_console (bool): If True, log console messages from the page.
|
log_console (bool): If True, log console messages from the page.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
|
||||||
# Dispatcher configuration
|
# Optional Parameters
|
||||||
memory_threshold_percent: float = 70.0
|
url: str = None # This is not a compulsory parameter
|
||||||
check_interval: float = 1.0
|
|
||||||
max_session_permit: int = 20
|
|
||||||
enable_rate_limiting: bool = False
|
|
||||||
rate_limit_config: Optional[RateLimitConfig] = None
|
|
||||||
display_mode: Optional[str] = None
|
|
||||||
url: str = None
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -400,13 +385,6 @@ class CrawlerRunConfig:
|
|||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
memory_threshold_percent: float = 70.0,
|
|
||||||
check_interval: float = 1.0,
|
|
||||||
max_session_permit: int = 20,
|
|
||||||
enable_rate_limiting: bool = False,
|
|
||||||
rate_limit_config: Optional[RateLimitConfig] = None,
|
|
||||||
display_mode: Optional[str] = None,
|
|
||||||
url: str = None,
|
url: str = None,
|
||||||
):
|
):
|
||||||
self.url = url
|
self.url = url
|
||||||
@@ -479,14 +457,6 @@ class CrawlerRunConfig:
|
|||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.log_console = log_console
|
self.log_console = log_console
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
self.memory_threshold_percent = memory_threshold_percent
|
|
||||||
self.check_interval = check_interval
|
|
||||||
self.max_session_permit = max_session_permit
|
|
||||||
self.enable_rate_limiting = enable_rate_limiting
|
|
||||||
self.rate_limit_config = rate_limit_config
|
|
||||||
self.display_mode = display_mode
|
|
||||||
|
|
||||||
# Validate type of extraction strategy and chunking strategy if they are provided
|
# Validate type of extraction strategy and chunking strategy if they are provided
|
||||||
if self.extraction_strategy is not None and not isinstance(
|
if self.extraction_strategy is not None and not isinstance(
|
||||||
self.extraction_strategy, ExtractionStrategy
|
self.extraction_strategy, ExtractionStrategy
|
||||||
@@ -573,13 +543,6 @@ class CrawlerRunConfig:
|
|||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
|
|
||||||
# Dispatcher configuration
|
|
||||||
memory_threshold_percent=kwargs.get("memory_threshold_percent", 70.0),
|
|
||||||
check_interval=kwargs.get("check_interval", 1.0),
|
|
||||||
max_session_permit=kwargs.get("max_session_permit", 20),
|
|
||||||
enable_rate_limiting=kwargs.get("enable_rate_limiting", False),
|
|
||||||
rate_limit_config=kwargs.get("rate_limit_config"),
|
|
||||||
display_mode=kwargs.get("display_mode"),
|
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -638,11 +601,5 @@ class CrawlerRunConfig:
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"memory_threshold_percent": self.memory_threshold_percent,
|
|
||||||
"check_interval": self.check_interval,
|
|
||||||
"max_session_permit": self.max_session_permit,
|
|
||||||
"enable_rate_limiting": self.enable_rate_limiting,
|
|
||||||
"rate_limit_config": self.rate_limit_config,
|
|
||||||
"display_mode": self.display_mode,
|
|
||||||
"url": self.url,
|
"url": self.url,
|
||||||
}
|
}
|
||||||
|
|||||||
560
crawl4ai/async_dispatcher.py
Normal file
560
crawl4ai/async_dispatcher.py
Normal file
@@ -0,0 +1,560 @@
|
|||||||
|
from typing import Dict, Optional, List
|
||||||
|
from .async_configs import *
|
||||||
|
from .models import *
|
||||||
|
|
||||||
|
from rich.live import Live
|
||||||
|
from rich.table import Table
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.style import Style
|
||||||
|
from rich import box
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import asyncio
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import random
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_delay: Tuple[float, float] = (1.0, 3.0),
|
||||||
|
max_delay: float = 60.0,
|
||||||
|
max_retries: int = 3,
|
||||||
|
rate_limit_codes: List[int] = None
|
||||||
|
):
|
||||||
|
self.base_delay = base_delay
|
||||||
|
self.max_delay = max_delay
|
||||||
|
self.max_retries = max_retries
|
||||||
|
self.rate_limit_codes = rate_limit_codes or [429, 503]
|
||||||
|
self.domains: Dict[str, DomainState] = {}
|
||||||
|
|
||||||
|
def get_domain(self, url: str) -> str:
|
||||||
|
return urlparse(url).netloc
|
||||||
|
|
||||||
|
async def wait_if_needed(self, url: str) -> None:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains.get(domain)
|
||||||
|
|
||||||
|
if not state:
|
||||||
|
self.domains[domain] = DomainState()
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
now = time.time()
|
||||||
|
if state.last_request_time:
|
||||||
|
wait_time = max(0, state.current_delay - (now - state.last_request_time))
|
||||||
|
if wait_time > 0:
|
||||||
|
await asyncio.sleep(wait_time)
|
||||||
|
|
||||||
|
# Random delay within base range if no current delay
|
||||||
|
if state.current_delay == 0:
|
||||||
|
state.current_delay = random.uniform(*self.base_delay)
|
||||||
|
|
||||||
|
state.last_request_time = time.time()
|
||||||
|
|
||||||
|
def update_delay(self, url: str, status_code: int) -> bool:
|
||||||
|
domain = self.get_domain(url)
|
||||||
|
state = self.domains[domain]
|
||||||
|
|
||||||
|
if status_code in self.rate_limit_codes:
|
||||||
|
state.fail_count += 1
|
||||||
|
if state.fail_count > self.max_retries:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Exponential backoff with random jitter
|
||||||
|
state.current_delay = min(
|
||||||
|
state.current_delay * 2 * random.uniform(0.75, 1.25),
|
||||||
|
self.max_delay
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Gradually reduce delay on success
|
||||||
|
state.current_delay = max(
|
||||||
|
random.uniform(*self.base_delay),
|
||||||
|
state.current_delay * 0.75
|
||||||
|
)
|
||||||
|
state.fail_count = 0
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
class CrawlerMonitor:
|
||||||
|
def __init__(self, max_visible_rows: int = 15, display_mode: DisplayMode = DisplayMode.DETAILED):
|
||||||
|
self.console = Console()
|
||||||
|
self.max_visible_rows = max_visible_rows
|
||||||
|
self.display_mode = display_mode
|
||||||
|
self.stats: Dict[str, CrawlStats] = {}
|
||||||
|
self.process = psutil.Process()
|
||||||
|
self.start_time = datetime.now()
|
||||||
|
self.live = Live(self._create_table(), refresh_per_second=2)
|
||||||
|
|
||||||
|
def start(self):
|
||||||
|
self.live.start()
|
||||||
|
|
||||||
|
def stop(self):
|
||||||
|
self.live.stop()
|
||||||
|
|
||||||
|
def add_task(self, task_id: str, url: str):
|
||||||
|
self.stats[task_id] = CrawlStats(task_id=task_id, url=url, status=CrawlStatus.QUEUED)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def update_task(self, task_id: str, **kwargs):
|
||||||
|
if task_id in self.stats:
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
setattr(self.stats[task_id], key, value)
|
||||||
|
self.live.update(self._create_table())
|
||||||
|
|
||||||
|
def _create_aggregated_table(self) -> Table:
|
||||||
|
"""Creates a compact table showing only aggregated statistics"""
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Status Overview",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue",
|
||||||
|
show_lines=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Calculate statistics
|
||||||
|
total_tasks = len(self.stats)
|
||||||
|
queued = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED)
|
||||||
|
in_progress = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed = sum(1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
# Memory statistics
|
||||||
|
current_memory = self.process.memory_info().rss / (1024 * 1024)
|
||||||
|
total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
peak_memory = max((stat.peak_memory for stat in self.stats.values()), default=0.0)
|
||||||
|
|
||||||
|
# Duration
|
||||||
|
duration = datetime.now() - self.start_time
|
||||||
|
|
||||||
|
# Create status row
|
||||||
|
table.add_column("Status", style="bold cyan")
|
||||||
|
table.add_column("Count", justify="right")
|
||||||
|
table.add_column("Percentage", justify="right")
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"Total Tasks",
|
||||||
|
str(total_tasks),
|
||||||
|
"100%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]In Queue[/yellow]",
|
||||||
|
str(queued),
|
||||||
|
f"{(queued/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[blue]In Progress[/blue]",
|
||||||
|
str(in_progress),
|
||||||
|
f"{(in_progress/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[green]Completed[/green]",
|
||||||
|
str(completed),
|
||||||
|
f"{(completed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[red]Failed[/red]",
|
||||||
|
str(failed),
|
||||||
|
f"{(failed/total_tasks*100):.1f}%" if total_tasks > 0 else "0%"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add memory information
|
||||||
|
table.add_section()
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Current Memory[/magenta]",
|
||||||
|
f"{current_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Total Task Memory[/magenta]",
|
||||||
|
f"{total_task_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[magenta]Peak Task Memory[/magenta]",
|
||||||
|
f"{peak_memory:.1f} MB",
|
||||||
|
""
|
||||||
|
)
|
||||||
|
table.add_row(
|
||||||
|
"[yellow]Runtime[/yellow]",
|
||||||
|
str(timedelta(seconds=int(duration.total_seconds()))),
|
||||||
|
""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_detailed_table(self) -> Table:
|
||||||
|
table = Table(
|
||||||
|
box=box.ROUNDED,
|
||||||
|
title="Crawler Performance Monitor",
|
||||||
|
title_style="bold magenta",
|
||||||
|
header_style="bold blue"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add columns
|
||||||
|
table.add_column("Task ID", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("URL", style="cyan", no_wrap=True)
|
||||||
|
table.add_column("Status", style="bold")
|
||||||
|
table.add_column("Memory (MB)", justify="right")
|
||||||
|
table.add_column("Peak (MB)", justify="right")
|
||||||
|
table.add_column("Duration", justify="right")
|
||||||
|
table.add_column("Info", style="italic")
|
||||||
|
|
||||||
|
# Add summary row
|
||||||
|
total_memory = sum(stat.memory_usage for stat in self.stats.values())
|
||||||
|
active_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.IN_PROGRESS)
|
||||||
|
completed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.COMPLETED)
|
||||||
|
failed_count = sum(1 for stat in self.stats.values()
|
||||||
|
if stat.status == CrawlStatus.FAILED)
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
"[bold yellow]SUMMARY",
|
||||||
|
f"Total: {len(self.stats)}",
|
||||||
|
f"Active: {active_count}",
|
||||||
|
f"{total_memory:.1f}",
|
||||||
|
f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
|
||||||
|
str(timedelta(seconds=int((datetime.now() - self.start_time).total_seconds()))),
|
||||||
|
f"✓{completed_count} ✗{failed_count}",
|
||||||
|
style="bold"
|
||||||
|
)
|
||||||
|
|
||||||
|
table.add_section()
|
||||||
|
|
||||||
|
# Add rows for each task
|
||||||
|
visible_stats = sorted(
|
||||||
|
self.stats.values(),
|
||||||
|
key=lambda x: (
|
||||||
|
x.status != CrawlStatus.IN_PROGRESS,
|
||||||
|
x.status != CrawlStatus.QUEUED,
|
||||||
|
x.end_time or datetime.max
|
||||||
|
)
|
||||||
|
)[:self.max_visible_rows]
|
||||||
|
|
||||||
|
for stat in visible_stats:
|
||||||
|
status_style = {
|
||||||
|
CrawlStatus.QUEUED: "white",
|
||||||
|
CrawlStatus.IN_PROGRESS: "yellow",
|
||||||
|
CrawlStatus.COMPLETED: "green",
|
||||||
|
CrawlStatus.FAILED: "red"
|
||||||
|
}[stat.status]
|
||||||
|
|
||||||
|
table.add_row(
|
||||||
|
stat.task_id[:8], # Show first 8 chars of task ID
|
||||||
|
stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
|
||||||
|
f"[{status_style}]{stat.status.value}[/{status_style}]",
|
||||||
|
f"{stat.memory_usage:.1f}",
|
||||||
|
f"{stat.peak_memory:.1f}",
|
||||||
|
stat.duration,
|
||||||
|
stat.error_message[:40] if stat.error_message else ""
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
def _create_table(self) -> Table:
|
||||||
|
"""Creates the appropriate table based on display mode"""
|
||||||
|
if self.display_mode == DisplayMode.AGGREGATED:
|
||||||
|
return self._create_aggregated_table()
|
||||||
|
return self._create_detailed_table()
|
||||||
|
|
||||||
|
|
||||||
|
class BaseDispatcher(ABC):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
self.crawler = None
|
||||||
|
self._domain_last_hit: Dict[str, float] = {}
|
||||||
|
self.concurrent_sessions = 0
|
||||||
|
self.rate_limiter = rate_limiter
|
||||||
|
self.monitor = monitor
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
memory_threshold_percent: float = 70.0,
|
||||||
|
check_interval: float = 1.0,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
memory_wait_timeout: float = 300.0, # 5 minutes default timeout
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
super().__init__(rate_limiter, monitor)
|
||||||
|
self.memory_threshold_percent = memory_threshold_percent
|
||||||
|
self.check_interval = check_interval
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
self.memory_wait_timeout = memory_wait_timeout
|
||||||
|
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
start_time = datetime.now()
|
||||||
|
error_message = ""
|
||||||
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
|
||||||
|
self.concurrent_sessions += 1
|
||||||
|
|
||||||
|
if self.rate_limiter:
|
||||||
|
await self.rate_limiter.wait_if_needed(url)
|
||||||
|
|
||||||
|
process = psutil.Process()
|
||||||
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||||
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
|
|
||||||
|
if self.rate_limiter and result.status_code:
|
||||||
|
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||||
|
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=datetime.now(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
error_message = result.error_message
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
elif self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
|
||||||
|
|
||||||
|
finally:
|
||||||
|
end_time = datetime.now()
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
end_time=end_time,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
self.concurrent_sessions -= 1
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
self.crawler = crawler
|
||||||
|
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
pending_tasks = []
|
||||||
|
active_tasks = []
|
||||||
|
task_queue = []
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.add_task(task_id, url)
|
||||||
|
task_queue.append((url, task_id))
|
||||||
|
|
||||||
|
while task_queue or active_tasks:
|
||||||
|
wait_start_time = time.time()
|
||||||
|
while len(active_tasks) < self.max_session_permit and task_queue:
|
||||||
|
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
|
||||||
|
# Check if we've exceeded the timeout
|
||||||
|
if time.time() - wait_start_time > self.memory_wait_timeout:
|
||||||
|
raise MemoryError(f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds")
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
url, task_id = task_queue.pop(0)
|
||||||
|
task = asyncio.create_task(self.crawl_url(url, config, task_id))
|
||||||
|
active_tasks.append(task)
|
||||||
|
|
||||||
|
if not active_tasks:
|
||||||
|
await asyncio.sleep(self.check_interval)
|
||||||
|
continue
|
||||||
|
|
||||||
|
done, pending = await asyncio.wait(
|
||||||
|
active_tasks,
|
||||||
|
return_when=asyncio.FIRST_COMPLETED
|
||||||
|
)
|
||||||
|
|
||||||
|
pending_tasks.extend(done)
|
||||||
|
active_tasks = list(pending)
|
||||||
|
|
||||||
|
return await asyncio.gather(*pending_tasks)
|
||||||
|
finally:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.stop()
|
||||||
|
|
||||||
|
class SemaphoreDispatcher(BaseDispatcher):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
semaphore_count: int = 5,
|
||||||
|
max_session_permit: int = 20,
|
||||||
|
rate_limiter: Optional[RateLimiter] = None,
|
||||||
|
monitor: Optional[CrawlerMonitor] = None
|
||||||
|
):
|
||||||
|
super().__init__(rate_limiter, monitor)
|
||||||
|
self.semaphore_count = semaphore_count
|
||||||
|
self.max_session_permit = max_session_permit
|
||||||
|
|
||||||
|
async def crawl_url(
|
||||||
|
self,
|
||||||
|
url: str,
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
task_id: str,
|
||||||
|
semaphore: asyncio.Semaphore = None
|
||||||
|
) -> CrawlerTaskResult:
|
||||||
|
start_time = datetime.now()
|
||||||
|
error_message = ""
|
||||||
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time)
|
||||||
|
|
||||||
|
if self.rate_limiter:
|
||||||
|
await self.rate_limiter.wait_if_needed(url)
|
||||||
|
|
||||||
|
async with semaphore:
|
||||||
|
process = psutil.Process()
|
||||||
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||||
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
|
|
||||||
|
if self.rate_limiter and result.status_code:
|
||||||
|
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||||
|
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=datetime.now(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
if not result.success:
|
||||||
|
error_message = result.error_message
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
elif self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_message = str(e)
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||||
|
result = CrawlResult(url=url, html="", metadata={}, success=False, error_message=str(e))
|
||||||
|
|
||||||
|
finally:
|
||||||
|
end_time = datetime.now()
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
end_time=end_time,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=result,
|
||||||
|
memory_usage=memory_usage,
|
||||||
|
peak_memory=peak_memory,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=end_time,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
async def run_urls(
|
||||||
|
self,
|
||||||
|
crawler: "AsyncWebCrawler",
|
||||||
|
urls: List[str],
|
||||||
|
config: CrawlerRunConfig,
|
||||||
|
) -> List[CrawlerTaskResult]:
|
||||||
|
self.crawler = crawler
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
semaphore = asyncio.Semaphore(self.semaphore_count)
|
||||||
|
tasks = []
|
||||||
|
|
||||||
|
for url in urls:
|
||||||
|
task_id = str(uuid.uuid4())
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.add_task(task_id, url)
|
||||||
|
task = asyncio.create_task(
|
||||||
|
self.crawl_url(url, config, task_id, semaphore)
|
||||||
|
)
|
||||||
|
tasks.append(task)
|
||||||
|
|
||||||
|
return await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
|
finally:
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.stop()
|
||||||
@@ -9,7 +9,7 @@ import json
|
|||||||
import asyncio
|
import asyncio
|
||||||
# from contextlib import nullcontext, asynccontextmanager
|
# from contextlib import nullcontext, asynccontextmanager
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import *
|
from .chunking_strategy import *
|
||||||
from .content_filter_strategy import *
|
from .content_filter_strategy import *
|
||||||
@@ -20,6 +20,8 @@ from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGene
|
|||||||
from .content_scraping_strategy import WebScrapingStrategy
|
from .content_scraping_strategy import WebScrapingStrategy
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
|
from .async_dispatcher import *
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
MIN_WORD_THRESHOLD,
|
MIN_WORD_THRESHOLD,
|
||||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||||
@@ -675,6 +677,7 @@ class AsyncWebCrawler:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
@@ -690,7 +693,7 @@ class AsyncWebCrawler:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently using MemoryAdaptiveDispatcher.
|
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||||
|
|
||||||
Migration Guide:
|
Migration Guide:
|
||||||
Old way (deprecated):
|
Old way (deprecated):
|
||||||
@@ -705,84 +708,83 @@ class AsyncWebCrawler:
|
|||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold=200,
|
word_count_threshold=200,
|
||||||
screenshot=True,
|
screenshot=True,
|
||||||
enable_rate_limiting=True,
|
dispatcher_config=DispatcherConfig(
|
||||||
rate_limit_config=RateLimitConfig(...),
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(...),
|
||||||
|
),
|
||||||
...
|
...
|
||||||
)
|
)
|
||||||
results = await crawler.arun_many(urls, config=config)
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=config,
|
||||||
|
dispatcher_strategy=MemoryAdaptiveDispatcher # Optional, this is the default
|
||||||
|
)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: List of URLs to crawl
|
urls: List of URLs to crawl
|
||||||
config: Configuration object controlling crawl behavior for all URLs
|
config: Configuration object controlling crawl behavior for all URLs
|
||||||
|
dispatcher_strategy: The dispatcher strategy class to use. Defaults to MemoryAdaptiveDispatcher.
|
||||||
[other parameters maintained for backwards compatibility]
|
[other parameters maintained for backwards compatibility]
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[CrawlResult]: Results for each URL
|
List[CrawlResult]: Results for each URL
|
||||||
"""
|
"""
|
||||||
# Handle configuration
|
# Create config if not provided
|
||||||
if config is not None:
|
if config is None:
|
||||||
if any(param is not None for param in [
|
config = CrawlerRunConfig(
|
||||||
word_count_threshold, extraction_strategy, chunking_strategy,
|
word_count_threshold=word_count_threshold,
|
||||||
content_filter, cache_mode, css_selector, screenshot, pdf
|
extraction_strategy=extraction_strategy,
|
||||||
]):
|
chunking_strategy=chunking_strategy,
|
||||||
self.logger.warning(
|
content_filter=content_filter,
|
||||||
message="Both config and legacy parameters provided. config will take precedence.",
|
cache_mode=cache_mode,
|
||||||
tag="WARNING"
|
bypass_cache=bypass_cache,
|
||||||
)
|
css_selector=css_selector,
|
||||||
else:
|
screenshot=screenshot,
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
pdf=pdf,
|
||||||
config_kwargs = {
|
verbose=verbose,
|
||||||
"word_count_threshold": word_count_threshold,
|
|
||||||
"extraction_strategy": extraction_strategy,
|
|
||||||
"chunking_strategy": chunking_strategy,
|
|
||||||
"content_filter": content_filter,
|
|
||||||
"cache_mode": cache_mode,
|
|
||||||
"bypass_cache": bypass_cache,
|
|
||||||
"css_selector": css_selector,
|
|
||||||
"screenshot": screenshot,
|
|
||||||
"pdf": pdf,
|
|
||||||
"verbose": verbose,
|
|
||||||
**kwargs
|
**kwargs
|
||||||
}
|
|
||||||
config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
|
||||||
|
|
||||||
if bypass_cache:
|
|
||||||
if kwargs.get("warning", True):
|
|
||||||
warnings.warn(
|
|
||||||
"'bypass_cache' is deprecated and will be removed in version 0.5.0. "
|
|
||||||
"Use 'cache_mode=CacheMode.BYPASS' instead. "
|
|
||||||
"Pass warning=False to suppress this warning.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2
|
|
||||||
)
|
|
||||||
if config.cache_mode is None:
|
|
||||||
config.cache_mode = CacheMode.BYPASS
|
|
||||||
|
|
||||||
from .dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
|
|
||||||
|
|
||||||
# Create dispatcher with configuration from CrawlerRunConfig
|
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
|
||||||
crawler=self,
|
|
||||||
memory_threshold_percent=config.memory_threshold_percent,
|
|
||||||
check_interval=config.check_interval,
|
|
||||||
max_session_permit=config.max_session_permit,
|
|
||||||
enable_rate_limiting=config.enable_rate_limiting,
|
|
||||||
rate_limit_config=vars(config.rate_limit_config) if config.rate_limit_config else None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create monitor if display mode is specified
|
|
||||||
monitor = None
|
|
||||||
if config.display_mode:
|
|
||||||
monitor = CrawlerMonitor(
|
|
||||||
max_visible_rows=15,
|
|
||||||
display_mode=DisplayMode(config.display_mode)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run URLs through dispatcher
|
# # Initialize the dispatcher with the selected strategy
|
||||||
task_results = await dispatcher.run_urls(urls, config, monitor=monitor)
|
# dispatcher = dispatcher_strategy(self, config.dispatcher_config)
|
||||||
|
|
||||||
# Convert CrawlerTaskResult to CrawlResult
|
# memory_monitor: CrawlerMonitor = None
|
||||||
return [task_result.result for task_result in task_results]
|
# if config.dispatcher_config.enable_monitor:
|
||||||
|
# memory_monitor = CrawlerMonitor(max_visible_rows=config.dispatcher_config.max_display_rows, display_mode=config.dispatcher_config.display_mode)
|
||||||
|
|
||||||
|
# Create default dispatcher if none provided
|
||||||
|
if dispatcher is None:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
self,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run the URLs through the dispatcher
|
||||||
|
_results: List[CrawlerTaskResult] = await dispatcher.run_urls(
|
||||||
|
crawler=self,
|
||||||
|
urls=urls,
|
||||||
|
config=config
|
||||||
|
)
|
||||||
|
|
||||||
|
results: CrawlResult = []
|
||||||
|
for res in _results:
|
||||||
|
_res : CrawlResult = res.result
|
||||||
|
dispatch_result: DispatchResult = DispatchResult(
|
||||||
|
task_id=res.task_id,
|
||||||
|
memory_usage=res.memory_usage,
|
||||||
|
peak_memory=res.peak_memory,
|
||||||
|
start_time=res.start_time,
|
||||||
|
end_time=res.end_time,
|
||||||
|
error_message=res.error_message
|
||||||
|
)
|
||||||
|
_res.dispatch_result = dispatch_result
|
||||||
|
results.append(_res)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
async def aclear_cache(self):
|
async def aclear_cache(self):
|
||||||
"""Clear the cache database."""
|
"""Clear the cache database."""
|
||||||
|
|||||||
@@ -1,8 +1,70 @@
|
|||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
|
||||||
from dataclasses import dataclass
|
from enum import Enum
|
||||||
|
from dataclasses import dataclass, field
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Dispatcher Models
|
||||||
|
###############################
|
||||||
|
@dataclass
|
||||||
|
class DomainState:
|
||||||
|
last_request_time: float = 0
|
||||||
|
current_delay: float = 0
|
||||||
|
fail_count: int = 0
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlerTaskResult:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
result: "CrawlResult"
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
class CrawlStatus(Enum):
|
||||||
|
QUEUED = "QUEUED"
|
||||||
|
IN_PROGRESS = "IN_PROGRESS"
|
||||||
|
COMPLETED = "COMPLETED"
|
||||||
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class CrawlStats:
|
||||||
|
task_id: str
|
||||||
|
url: str
|
||||||
|
status: CrawlStatus
|
||||||
|
start_time: Optional[datetime] = None
|
||||||
|
end_time: Optional[datetime] = None
|
||||||
|
memory_usage: float = 0.0
|
||||||
|
peak_memory: float = 0.0
|
||||||
|
error_message: str = ""
|
||||||
|
|
||||||
|
@property
|
||||||
|
def duration(self) -> str:
|
||||||
|
if not self.start_time:
|
||||||
|
return "0:00"
|
||||||
|
end = self.end_time or datetime.now()
|
||||||
|
duration = end - self.start_time
|
||||||
|
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
class DisplayMode(Enum):
|
||||||
|
DETAILED = "DETAILED"
|
||||||
|
AGGREGATED = "AGGREGATED"
|
||||||
|
|
||||||
|
###############################
|
||||||
|
# Crawler Models
|
||||||
|
###############################
|
||||||
@dataclass
|
@dataclass
|
||||||
class TokenUsage:
|
class TokenUsage:
|
||||||
completion_tokens: int = 0
|
completion_tokens: int = 0
|
||||||
@@ -23,6 +85,13 @@ class MarkdownGenerationResult(BaseModel):
|
|||||||
fit_markdown: Optional[str] = None
|
fit_markdown: Optional[str] = None
|
||||||
fit_html: Optional[str] = None
|
fit_html: Optional[str] = None
|
||||||
|
|
||||||
|
class DispatchResult(BaseModel):
|
||||||
|
task_id: str
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
@@ -44,6 +113,7 @@ class CrawlResult(BaseModel):
|
|||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|||||||
@@ -1,67 +1,121 @@
|
|||||||
import asyncio, time
|
import asyncio
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
import time
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
from rich import print
|
||||||
from crawl4ai.dispatcher import DisplayMode
|
from rich.table import Table
|
||||||
|
from crawl4ai import (
|
||||||
async def crawl_with_rate_limiting(urls):
|
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||||
"""
|
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
|
||||||
Example function demonstrating how to use AsyncWebCrawler with rate limiting and resource monitoring.
|
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
|
||||||
|
)
|
||||||
Args:
|
|
||||||
urls (List[str]): List of URLs to crawl
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List[CrawlResult]: List of crawl results for each URL
|
|
||||||
"""
|
|
||||||
# Configure browser settings
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True, # Run browser in headless mode
|
|
||||||
verbose=False # Minimize browser logging
|
|
||||||
)
|
|
||||||
|
|
||||||
# Configure crawler settings with rate limiting
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
# Enable rate limiting
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds between requests
|
|
||||||
max_delay=30.0, # Maximum delay after rate limit hits
|
|
||||||
max_retries=2, # Number of retries before giving up
|
|
||||||
rate_limit_codes=[429, 503] # HTTP status codes to trigger rate limiting
|
|
||||||
),
|
|
||||||
# Resource monitoring settings
|
|
||||||
memory_threshold_percent=70.0, # Pause crawling if memory usage exceeds this
|
|
||||||
check_interval=0.5, # How often to check resource usage
|
|
||||||
max_session_permit=10, # Maximum concurrent crawls
|
|
||||||
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create and use crawler with context manager
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
results = await crawler.arun_many(urls, config=run_config)
|
|
||||||
return results
|
|
||||||
|
|
||||||
def main():
|
|
||||||
# Example URLs (replace with real URLs)
|
|
||||||
urls = [
|
|
||||||
f"https://example.com/page{i}" for i in range(1, 40)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
async def memory_adaptive(urls, browser_config, run_config):
|
||||||
|
"""Memory adaptive crawler with monitoring"""
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10,
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Run the crawler
|
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
|
||||||
results = asyncio.run(crawl_with_rate_limiting(urls))
|
"""Memory adaptive crawler with rate limiting"""
|
||||||
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Process results
|
async def semaphore(urls, browser_config, run_config):
|
||||||
successful_results = [result for result in results if result.success]
|
"""Basic semaphore crawler"""
|
||||||
failed_results = [result for result in results if not result.success]
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
end = time.perf_counter()
|
async def semaphore_with_rate_limit(urls, browser_config, run_config):
|
||||||
|
"""Semaphore crawler with rate limiting"""
|
||||||
|
start = time.perf_counter()
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.perf_counter() - start
|
||||||
|
return len(results), duration
|
||||||
|
|
||||||
# Print results
|
def create_performance_table(results):
|
||||||
print(f"Successful crawls: {len(successful_results)}")
|
"""Creates a rich table showing performance results"""
|
||||||
print(f"Failed crawls: {len(failed_results)}")
|
table = Table(title="Crawler Strategy Performance Comparison")
|
||||||
print(f"Time taken: {end - start:.2f} seconds")
|
table.add_column("Strategy", style="cyan")
|
||||||
|
table.add_column("URLs Crawled", justify="right", style="green")
|
||||||
|
table.add_column("Time (seconds)", justify="right", style="yellow")
|
||||||
|
table.add_column("URLs/second", justify="right", style="magenta")
|
||||||
|
|
||||||
|
sorted_results = sorted(results.items(), key=lambda x: x[1][1])
|
||||||
|
|
||||||
|
for strategy, (urls_crawled, duration) in sorted_results:
|
||||||
|
urls_per_second = urls_crawled / duration
|
||||||
|
table.add_row(
|
||||||
|
strategy,
|
||||||
|
str(urls_crawled),
|
||||||
|
f"{duration:.2f}",
|
||||||
|
f"{urls_per_second:.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return table
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [f"https://example.com/page{i}" for i in range(1, 20)]
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
results = {
|
||||||
|
"Memory Adaptive": await memory_adaptive(urls, browser_config, run_config),
|
||||||
|
"Memory Adaptive + Rate Limit": await memory_adaptive_with_rate_limit(urls, browser_config, run_config),
|
||||||
|
"Semaphore": await semaphore(urls, browser_config, run_config),
|
||||||
|
"Semaphore + Rate Limit": await semaphore_with_rate_limit(urls, browser_config, run_config),
|
||||||
|
}
|
||||||
|
|
||||||
|
table = create_performance_table(results)
|
||||||
|
print("\nPerformance Summary:")
|
||||||
|
print(table)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
asyncio.run(main())
|
||||||
264
docs/md_v2/advanced/multi-url-crawling copy.md
Normal file
264
docs/md_v2/advanced/multi-url-crawling copy.md
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
# Optimized Multi-URL Crawling
|
||||||
|
|
||||||
|
> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated.
|
||||||
|
|
||||||
|
|
||||||
|
Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to:
|
||||||
|
|
||||||
|
1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
|
||||||
|
2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
|
||||||
|
|
||||||
|
When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Why Avoid Simple Loops per URL?
|
||||||
|
|
||||||
|
If you naively do:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for url in urls:
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(url)
|
||||||
|
```
|
||||||
|
|
||||||
|
You end up:
|
||||||
|
|
||||||
|
1. Spinning up a **new** browser for each URL
|
||||||
|
2. Closing it immediately after the single crawl
|
||||||
|
3. Potentially using a lot of CPU/memory for short-living browsers
|
||||||
|
4. Missing out on session reusability if you have login or ongoing states
|
||||||
|
|
||||||
|
**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Sequential Crawling with Session Reuse
|
||||||
|
|
||||||
|
### 2.1 Overview
|
||||||
|
|
||||||
|
1. **One** `AsyncWebCrawler` instance for **all** URLs.
|
||||||
|
2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
|
||||||
|
3. The crawler is only closed at the **end**.
|
||||||
|
|
||||||
|
**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
|
||||||
|
|
||||||
|
### 2.2 Example Code
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
async def crawl_sequential(urls: List[str]):
|
||||||
|
print("\n=== Sequential Crawling with Session Reuse ===")
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
# For better performance in Docker or low-memory environments:
|
||||||
|
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
||||||
|
)
|
||||||
|
|
||||||
|
crawl_config = CrawlerRunConfig(
|
||||||
|
markdown_generator=DefaultMarkdownGenerator()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create the crawler (opens the browser)
|
||||||
|
crawler = AsyncWebCrawler(config=browser_config)
|
||||||
|
await crawler.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
session_id = "session1" # Reuse the same session across all URLs
|
||||||
|
for url in urls:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=url,
|
||||||
|
config=crawl_config,
|
||||||
|
session_id=session_id
|
||||||
|
)
|
||||||
|
if result.success:
|
||||||
|
print(f"Successfully crawled: {url}")
|
||||||
|
# E.g. check markdown length
|
||||||
|
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
||||||
|
else:
|
||||||
|
print(f"Failed: {url} - Error: {result.error_message}")
|
||||||
|
finally:
|
||||||
|
# After all URLs are done, close the crawler (and the browser)
|
||||||
|
await crawler.close()
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
await crawl_sequential(urls)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why It’s Good**:
|
||||||
|
|
||||||
|
- **One** browser launch.
|
||||||
|
- Minimal memory usage.
|
||||||
|
- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Parallel Crawling with Browser Reuse
|
||||||
|
|
||||||
|
### 3.1 Overview
|
||||||
|
|
||||||
|
To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
|
||||||
|
|
||||||
|
### 3.2 Example Code
|
||||||
|
|
||||||
|
For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install psutil
|
||||||
|
```
|
||||||
|
|
||||||
|
Then you can run the following code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import psutil
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
__location__ = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
__output__ = os.path.join(__location__, "output")
|
||||||
|
|
||||||
|
# Append parent directory to system path
|
||||||
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
sys.path.append(parent_dir)
|
||||||
|
|
||||||
|
from typing import List
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
||||||
|
print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
|
||||||
|
|
||||||
|
# We'll keep track of peak memory usage across all tasks
|
||||||
|
peak_memory = 0
|
||||||
|
process = psutil.Process(os.getpid())
|
||||||
|
|
||||||
|
def log_memory(prefix: str = ""):
|
||||||
|
nonlocal peak_memory
|
||||||
|
current_mem = process.memory_info().rss # in bytes
|
||||||
|
if current_mem > peak_memory:
|
||||||
|
peak_memory = current_mem
|
||||||
|
print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
|
||||||
|
|
||||||
|
# Minimal browser config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False, # corrected from 'verbos=False'
|
||||||
|
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
||||||
|
)
|
||||||
|
crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
# Create the crawler instance
|
||||||
|
crawler = AsyncWebCrawler(config=browser_config)
|
||||||
|
await crawler.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# We'll chunk the URLs in batches of 'max_concurrent'
|
||||||
|
success_count = 0
|
||||||
|
fail_count = 0
|
||||||
|
for i in range(0, len(urls), max_concurrent):
|
||||||
|
batch = urls[i : i + max_concurrent]
|
||||||
|
tasks = []
|
||||||
|
|
||||||
|
for j, url in enumerate(batch):
|
||||||
|
# Unique session_id per concurrent sub-task
|
||||||
|
session_id = f"parallel_session_{i + j}"
|
||||||
|
task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
|
||||||
|
tasks.append(task)
|
||||||
|
|
||||||
|
# Check memory usage prior to launching tasks
|
||||||
|
log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
|
||||||
|
|
||||||
|
# Gather results
|
||||||
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||||
|
|
||||||
|
# Check memory usage after tasks complete
|
||||||
|
log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
|
||||||
|
|
||||||
|
# Evaluate results
|
||||||
|
for url, result in zip(batch, results):
|
||||||
|
if isinstance(result, Exception):
|
||||||
|
print(f"Error crawling {url}: {result}")
|
||||||
|
fail_count += 1
|
||||||
|
elif result.success:
|
||||||
|
success_count += 1
|
||||||
|
else:
|
||||||
|
fail_count += 1
|
||||||
|
|
||||||
|
print(f"\nSummary:")
|
||||||
|
print(f" - Successfully crawled: {success_count}")
|
||||||
|
print(f" - Failed: {fail_count}")
|
||||||
|
|
||||||
|
finally:
|
||||||
|
print("\nClosing crawler...")
|
||||||
|
await crawler.close()
|
||||||
|
# Final memory log
|
||||||
|
log_memory(prefix="Final: ")
|
||||||
|
print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3",
|
||||||
|
"https://example.com/page4"
|
||||||
|
]
|
||||||
|
await crawl_parallel(urls, max_concurrent=2)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
**Notes**:
|
||||||
|
|
||||||
|
- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
|
||||||
|
- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired).
|
||||||
|
- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Performance Tips
|
||||||
|
|
||||||
|
1. **Extra Browser Args**
|
||||||
|
- `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
|
||||||
|
- `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
|
||||||
|
|
||||||
|
2. **Session Reuse**
|
||||||
|
- If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`.
|
||||||
|
- If you want isolation (each URL fresh), create unique sessions.
|
||||||
|
|
||||||
|
3. **Batching**
|
||||||
|
- If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`).
|
||||||
|
- Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible.
|
||||||
|
|
||||||
|
4. **Cache**
|
||||||
|
- If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`.
|
||||||
|
- If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`.
|
||||||
|
|
||||||
|
5. **Hooks**
|
||||||
|
- You can set up global hooks for each crawler (like to block images) or per-run if you want.
|
||||||
|
- Keep them consistent if you’re reusing sessions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Summary
|
||||||
|
|
||||||
|
- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL.
|
||||||
|
- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs.
|
||||||
|
- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead.
|
||||||
|
- Close the crawler once at the end, ensuring the browser is only opened/closed once.
|
||||||
|
|
||||||
|
For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling!
|
||||||
@@ -1,264 +1,205 @@
|
|||||||
# Optimized Multi-URL Crawling
|
# Advanced Multi-URL Crawling with Dispatchers
|
||||||
|
|
||||||
> **Note**: We’re developing a new **executor module** that uses a sophisticated algorithm to dynamically manage multi-URL crawling, optimizing for speed and memory usage. The approaches in this document remain fully valid, but keep an eye on **Crawl4AI**’s upcoming releases for this powerful feature! Follow [@unclecode](https://twitter.com/unclecode) on X and check the changelogs to stay updated.
|
> **Heads Up**: Crawl4AI supports advanced dispatchers for **parallel** or **throttled** crawling, providing dynamic rate limiting and memory usage checks. The built-in `arun_many()` function uses these dispatchers to handle concurrency efficiently.
|
||||||
|
|
||||||
|
## 1. Introduction
|
||||||
|
|
||||||
Crawl4AI’s **AsyncWebCrawler** can handle multiple URLs in a single run, which can greatly reduce overhead and speed up crawling. This guide shows how to:
|
When crawling many URLs:
|
||||||
|
- **Basic**: Use `arun()` in a loop (simple but less efficient)
|
||||||
|
- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
|
||||||
|
- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
|
||||||
|
|
||||||
1. **Sequentially** crawl a list of URLs using the **same** session, avoiding repeated browser creation.
|
**Why Dispatchers?**
|
||||||
2. **Parallel**-crawl subsets of URLs in batches, again reusing the same browser.
|
- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
|
||||||
|
- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
|
||||||
|
- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
|
||||||
|
- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
|
||||||
|
|
||||||
When the entire process finishes, you close the browser once—**minimizing** memory and resource usage.
|
## 2. Core Components
|
||||||
|
|
||||||
---
|
### 2.1 Rate Limiter
|
||||||
|
|
||||||
## 1. Why Avoid Simple Loops per URL?
|
|
||||||
|
|
||||||
If you naively do:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
for url in urls:
|
class RateLimiter:
|
||||||
async with AsyncWebCrawler() as crawler:
|
def __init__(
|
||||||
result = await crawler.arun(url)
|
base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests
|
||||||
|
max_delay: float = 60.0, # Maximum backoff delay
|
||||||
|
max_retries: int = 3, # Retries before giving up
|
||||||
|
rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
You end up:
|
The RateLimiter provides:
|
||||||
|
- Random delays between requests
|
||||||
|
- Exponential backoff on rate limit responses
|
||||||
|
- Domain-specific rate limiting
|
||||||
|
- Automatic retry handling
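To make the backoff behavior concrete, here is a rough sketch of how an exponential backoff delay could be derived from the parameters above (an illustration only, not the library's internal implementation):

```python
import random

def backoff_delay(attempt: int,
                  base_delay=(1.0, 3.0),
                  max_delay: float = 60.0) -> float:
    # Pick a random delay in the base range, double it on each retry,
    # and cap the result at max_delay.
    base = random.uniform(*base_delay)
    return min(base * (2 ** attempt), max_delay)

# e.g. attempt 0 -> ~1-3s, attempt 1 -> ~2-6s, attempt 2 -> ~4-12s, capped at 60s
```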
|
||||||
|
|
||||||
1. Spinning up a **new** browser for each URL
|
### 2.2 Crawler Monitor
|
||||||
2. Closing it immediately after the single crawl
|
|
||||||
3. Potentially using a lot of CPU/memory for short-living browsers
|
|
||||||
4. Missing out on session reusability if you have login or ongoing states
|
|
||||||
|
|
||||||
**Better** approaches ensure you **create** the browser once, then crawl multiple URLs with minimal overhead.
|
The CrawlerMonitor provides real-time visibility into crawling operations:
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Sequential Crawling with Session Reuse
|
|
||||||
|
|
||||||
### 2.1 Overview
|
|
||||||
|
|
||||||
1. **One** `AsyncWebCrawler` instance for **all** URLs.
|
|
||||||
2. **One** session (via `session_id`) so we can preserve local storage or cookies across URLs if needed.
|
|
||||||
3. The crawler is only closed at the **end**.
|
|
||||||
|
|
||||||
**This** is the simplest pattern if your workload is moderate (dozens to a few hundred URLs).
|
|
||||||
|
|
||||||
### 2.2 Example Code
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
monitor = CrawlerMonitor(
|
||||||
from typing import List
|
max_visible_rows=15, # Maximum rows in live display
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
)
|
||||||
|
|
||||||
async def crawl_sequential(urls: List[str]):
|
|
||||||
print("\n=== Sequential Crawling with Session Reuse ===")
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
# For better performance in Docker or low-memory environments:
|
|
||||||
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
||||||
)
|
|
||||||
|
|
||||||
crawl_config = CrawlerRunConfig(
|
|
||||||
markdown_generator=DefaultMarkdownGenerator()
|
|
||||||
)
|
|
||||||
|
|
||||||
# Create the crawler (opens the browser)
|
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
await crawler.start()
|
|
||||||
|
|
||||||
try:
|
|
||||||
session_id = "session1" # Reuse the same session across all URLs
|
|
||||||
for url in urls:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url=url,
|
|
||||||
config=crawl_config,
|
|
||||||
session_id=session_id
|
|
||||||
)
|
|
||||||
if result.success:
|
|
||||||
print(f"Successfully crawled: {url}")
|
|
||||||
# E.g. check markdown length
|
|
||||||
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
|
|
||||||
else:
|
|
||||||
print(f"Failed: {url} - Error: {result.error_message}")
|
|
||||||
finally:
|
|
||||||
# After all URLs are done, close the crawler (and the browser)
|
|
||||||
await crawler.close()
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
urls = [
|
|
||||||
"https://example.com/page1",
|
|
||||||
"https://example.com/page2",
|
|
||||||
"https://example.com/page3"
|
|
||||||
]
|
|
||||||
await crawl_sequential(urls)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(main())
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Why It’s Good**:
|
**Display Modes**:
|
||||||
|
1. **DETAILED**: Shows individual task status, memory usage, and timing
|
||||||
|
2. **AGGREGATED**: Displays summary statistics and overall progress
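For large URL lists where per-task rows would be noisy, the aggregated view is usually enough. A minimal sketch using the same `CrawlerMonitor`/`DisplayMode` API shown above:

```python
monitor = CrawlerMonitor(
    max_visible_rows=10,
    display_mode=DisplayMode.AGGREGATED,  # summary statistics instead of per-task rows
)
```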
|
||||||
|
|
||||||
- **One** browser launch.
|
## 3. Available Dispatchers
|
||||||
- Minimal memory usage.
|
|
||||||
- If the site requires login, you can log in once in `session_id` context and preserve auth across all URLs.
|
|
||||||
|
|
||||||
---
|
### 3.1 MemoryAdaptiveDispatcher (Default)
|
||||||
|
|
||||||
## 3. Parallel Crawling with Browser Reuse
|
Automatically manages concurrency based on system memory usage:
|
||||||
|
|
||||||
### 3.1 Overview
|
|
||||||
|
|
||||||
To speed up crawling further, you can crawl multiple URLs in **parallel** (batches or a concurrency limit). The crawler still uses **one** browser, but spawns different sessions (or the same, depending on your logic) for each task.
|
|
||||||
|
|
||||||
### 3.2 Example Code
|
|
||||||
|
|
||||||
For this example make sure to install the [psutil](https://pypi.org/project/psutil/) package.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install psutil
|
|
||||||
```
|
|
||||||
|
|
||||||
Then you can run the following code:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
import sys
|
memory_threshold_percent=70.0, # Pause if memory exceeds this
|
||||||
import psutil
|
check_interval=1.0, # How often to check memory
|
||||||
import asyncio
|
max_session_permit=10, # Maximum concurrent tasks
|
||||||
|
rate_limiter=RateLimiter( # Optional rate limiting
|
||||||
__location__ = os.path.dirname(os.path.abspath(__file__))
|
base_delay=(1.0, 2.0),
|
||||||
__output__ = os.path.join(__location__, "output")
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
# Append parent directory to system path
|
),
|
||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
monitor=CrawlerMonitor( # Optional monitoring
|
||||||
sys.path.append(parent_dir)
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
from typing import List
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
||||||
|
|
||||||
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
|
|
||||||
print("\n=== Parallel Crawling with Browser Reuse + Memory Check ===")
|
|
||||||
|
|
||||||
# We'll keep track of peak memory usage across all tasks
|
|
||||||
peak_memory = 0
|
|
||||||
process = psutil.Process(os.getpid())
|
|
||||||
|
|
||||||
def log_memory(prefix: str = ""):
|
|
||||||
nonlocal peak_memory
|
|
||||||
current_mem = process.memory_info().rss # in bytes
|
|
||||||
if current_mem > peak_memory:
|
|
||||||
peak_memory = current_mem
|
|
||||||
print(f"{prefix} Current Memory: {current_mem // (1024 * 1024)} MB, Peak: {peak_memory // (1024 * 1024)} MB")
|
|
||||||
|
|
||||||
# Minimal browser config
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
verbose=False,
|
|
||||||
extra_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
|
|
||||||
)
|
)
|
||||||
crawl_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
)
|
||||||
|
|
||||||
# Create the crawler instance
|
|
||||||
crawler = AsyncWebCrawler(config=browser_config)
|
|
||||||
await crawler.start()
|
|
||||||
|
|
||||||
try:
|
|
||||||
# We'll chunk the URLs in batches of 'max_concurrent'
|
|
||||||
success_count = 0
|
|
||||||
fail_count = 0
|
|
||||||
for i in range(0, len(urls), max_concurrent):
|
|
||||||
batch = urls[i : i + max_concurrent]
|
|
||||||
tasks = []
|
|
||||||
|
|
||||||
for j, url in enumerate(batch):
|
|
||||||
# Unique session_id per concurrent sub-task
|
|
||||||
session_id = f"parallel_session_{i + j}"
|
|
||||||
task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
|
|
||||||
tasks.append(task)
|
|
||||||
|
|
||||||
# Check memory usage prior to launching tasks
|
|
||||||
log_memory(prefix=f"Before batch {i//max_concurrent + 1}: ")
|
|
||||||
|
|
||||||
# Gather results
|
|
||||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
||||||
|
|
||||||
# Check memory usage after tasks complete
|
|
||||||
log_memory(prefix=f"After batch {i//max_concurrent + 1}: ")
|
|
||||||
|
|
||||||
# Evaluate results
|
|
||||||
for url, result in zip(batch, results):
|
|
||||||
if isinstance(result, Exception):
|
|
||||||
print(f"Error crawling {url}: {result}")
|
|
||||||
fail_count += 1
|
|
||||||
elif result.success:
|
|
||||||
success_count += 1
|
|
||||||
else:
|
|
||||||
fail_count += 1
|
|
||||||
|
|
||||||
print(f"\nSummary:")
|
|
||||||
print(f" - Successfully crawled: {success_count}")
|
|
||||||
print(f" - Failed: {fail_count}")
|
|
||||||
|
|
||||||
finally:
|
|
||||||
print("\nClosing crawler...")
|
|
||||||
await crawler.close()
|
|
||||||
# Final memory log
|
|
||||||
log_memory(prefix="Final: ")
|
|
||||||
print(f"\nPeak memory usage (MB): {peak_memory // (1024 * 1024)}")
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
urls = [
|
|
||||||
"https://example.com/page1",
|
|
||||||
"https://example.com/page2",
|
|
||||||
"https://example.com/page3",
|
|
||||||
"https://example.com/page4"
|
|
||||||
]
|
|
||||||
await crawl_parallel(urls, max_concurrent=2)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(main())
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Notes**:
|
### 3.2 SemaphoreDispatcher
|
||||||
|
|
||||||
- We **reuse** the same `AsyncWebCrawler` instance for all parallel tasks, launching **one** browser.
|
Provides simple concurrency control with a fixed limit:
|
||||||
- Each parallel sub-task might get its own `session_id` so they don’t share cookies/localStorage (unless that’s desired).
|
|
||||||
- We limit concurrency to `max_concurrent=2` or 3 to avoid saturating CPU/memory.
|
|
||||||
|
|
||||||
---
|
```python
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5, # Fixed concurrent tasks
|
||||||
|
rate_limiter=RateLimiter( # Optional rate limiting
|
||||||
|
base_delay=(0.5, 1.0),
|
||||||
|
max_delay=10.0
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor( # Optional monitoring
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
## 4. Performance Tips
|
## 4. Usage Examples
|
||||||
|
|
||||||
1. **Extra Browser Args**
|
### 4.1 Simple Usage (Default MemoryAdaptiveDispatcher)
|
||||||
- `--disable-gpu`, `--no-sandbox` can help in Docker or restricted environments.
|
|
||||||
- `--disable-dev-shm-usage` avoids using `/dev/shm` which can be small on some systems.
|
|
||||||
|
|
||||||
2. **Session Reuse**
|
```python
|
||||||
- If your site requires a login or you want to maintain local data across URLs, share the **same** `session_id`.
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
- If you want isolation (each URL fresh), create unique sessions.
|
results = await crawler.arun_many(urls, config=run_config)
|
||||||
|
```
|
||||||
|
|
||||||
3. **Batching**
|
### 4.2 Memory Adaptive with Rate Limiting
|
||||||
- If you have **many** URLs (like thousands), you can do parallel crawling in chunks (like `max_concurrent=5`).
|
|
||||||
- Use `arun_many()` for a built-in approach if you prefer, but the example above is often more flexible.
|
|
||||||
|
|
||||||
4. **Cache**
|
```python
|
||||||
- If your pages share many resources or you’re re-crawling the same domain repeatedly, consider setting `cache_mode=CacheMode.ENABLED` in `CrawlerRunConfig`.
|
async def crawl_with_memory_adaptive(urls):
|
||||||
- If you need fresh data each time, keep `cache_mode=CacheMode.BYPASS`.
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
5. **Hooks**
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
- You can set up global hooks for each crawler (like to block images) or per-run if you want.
|
memory_threshold_percent=70.0,
|
||||||
- Keep them consistent if you’re reusing sessions.
|
max_session_permit=10,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(1.0, 2.0),
|
||||||
|
max_delay=30.0,
|
||||||
|
max_retries=2
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
---
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
```
|
||||||
|
|
||||||
## 5. Summary
|
### 4.3 Semaphore with Rate Limiting
|
||||||
|
|
||||||
- **One** `AsyncWebCrawler` + multiple calls to `.arun()` is far more efficient than launching a new crawler per URL.
|
```python
|
||||||
- **Sequential** approach with a shared session is simple and memory-friendly for moderate sets of URLs.
|
async def crawl_with_semaphore(urls):
|
||||||
- **Parallel** approach can speed up large crawls by concurrency, but keep concurrency balanced to avoid overhead.
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
- Close the crawler once at the end, ensuring the browser is only opened/closed once.
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
For even more advanced memory optimizations or dynamic concurrency patterns, see future sections on hooking or distributed crawling. The patterns above suffice for the majority of multi-URL scenarios—**giving you speed, simplicity, and minimal resource usage**. Enjoy your optimized crawling!
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=5,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.5, 1.0),
|
||||||
|
max_delay=10.0
|
||||||
|
),
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
max_visible_rows=15,
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls,
|
||||||
|
config=run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
return results
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5. Dispatch Results
|
||||||
|
|
||||||
|
Each crawl result includes dispatch information:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class DispatchResult:
|
||||||
|
task_id: str
|
||||||
|
memory_usage: float
|
||||||
|
peak_memory: float
|
||||||
|
start_time: datetime
|
||||||
|
end_time: datetime
|
||||||
|
error_message: str = ""
|
||||||
|
```
|
||||||
|
|
||||||
|
Access via `result.dispatch_result`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
for result in results:
|
||||||
|
if result.success:
|
||||||
|
dr = result.dispatch_result
|
||||||
|
print(f"URL: {result.url}")
|
||||||
|
print(f"Memory: {dr.memory_usage:.1f}MB")
|
||||||
|
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 6. Summary
|
||||||
|
|
||||||
|
1. **Two Dispatcher Types**:
|
||||||
|
- MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
|
||||||
|
- SemaphoreDispatcher: Fixed concurrency limit
|
||||||
|
|
||||||
|
2. **Optional Components**:
|
||||||
|
- RateLimiter: Smart request pacing and backoff
|
||||||
|
- CrawlerMonitor: Real-time progress visualization
|
||||||
|
|
||||||
|
3. **Key Benefits**:
|
||||||
|
- Automatic memory management
|
||||||
|
- Built-in rate limiting
|
||||||
|
- Live progress monitoring
|
||||||
|
- Flexible concurrency control
|
||||||
|
|
||||||
|
Choose the dispatcher that best fits your needs:
|
||||||
|
- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
|
||||||
|
- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios
|
||||||
|
|||||||
@@ -1,7 +1,3 @@
|
|||||||
Below is a **revised parameter guide** for **`arun()`** in **AsyncWebCrawler**, reflecting the **new** approach where all parameters are passed via a **`CrawlerRunConfig`** instead of directly to `arun()`. Each section includes example usage in the new style, ensuring a clear, modern approach.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# `arun()` Parameter Guide (New Approach)
|
# `arun()` Parameter Guide (New Approach)
|
||||||
|
|
||||||
In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
|
In Crawl4AI’s **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:
|
||||||
|
|||||||
100
docs/md_v2/api/arun_many.md
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
# `arun_many(...)` Reference
|
||||||
|
|
||||||
|
> **Note**: This function is very similar to [`arun()`](./arun.md) but focused on **concurrent** or **batch** crawling. If you’re unfamiliar with `arun()` usage, please read that doc first, then review this for differences.
|
||||||
|
|
||||||
|
## Function Signature
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def arun_many(
|
||||||
|
urls: Union[List[str], List[Any]],
|
||||||
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
|
...
|
||||||
|
) -> List[CrawlResult]:
|
||||||
|
"""
|
||||||
|
Crawl multiple URLs concurrently or in batches.
|
||||||
|
|
||||||
|
:param urls: A list of URLs (or tasks) to crawl.
|
||||||
|
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||||
|
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||||
|
...
|
||||||
|
:return: A list of `CrawlResult` objects, one per URL.
|
||||||
|
"""
|
||||||
|
```
|
||||||
|
|
||||||
|
## Differences from `arun()`
|
||||||
|
|
||||||
|
1. **Multiple URLs**:
|
||||||
|
- Instead of crawling a single URL, you pass a list of them (strings or tasks).
|
||||||
|
- The function returns a **list** of `CrawlResult`, in the same order as `urls`.
|
||||||
|
|
||||||
|
2. **Concurrency & Dispatchers**:
|
||||||
|
- **`dispatcher`** param allows advanced concurrency control.
|
||||||
|
- If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
|
||||||
|
- Dispatchers handle concurrency, rate limiting, and memory-based adaptive throttling (see [Multi-URL Crawling](../advanced/multi-url-crawling.md)).
|
||||||
|
|
||||||
|
3. **Parallel Execution**:
|
||||||
|
- `arun_many()` can run multiple requests concurrently under the hood.
|
||||||
|
- Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).
|
||||||
|
|
||||||
|
### Basic Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Minimal usage: The default dispatcher will be used
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=["https://site1.com", "https://site2.com"],
|
||||||
|
config=my_run_config
|
||||||
|
)
|
||||||
|
|
||||||
|
for res in results:
|
||||||
|
if res.success:
|
||||||
|
print(res.url, "crawled OK!")
|
||||||
|
else:
|
||||||
|
print("Failed:", res.url, "-", res.error_message)
|
||||||
|
```
|
||||||
|
|
||||||
|
### With a Custom Dispatcher
|
||||||
|
|
||||||
|
```python
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=10
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=["https://site1.com", "https://site2.com", "https://site3.com"],
|
||||||
|
config=my_run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points**:
|
||||||
|
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||||
|
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||||
|
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||||
|
|
||||||
|
### Return Value
|
||||||
|
|
||||||
|
A **list** of [`CrawlResult`](./crawl-result.md) objects, one per URL. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dispatcher Reference
|
||||||
|
|
||||||
|
- **`MemoryAdaptiveDispatcher`**: Dynamically manages concurrency based on system memory usage.
|
||||||
|
- **`SemaphoreDispatcher`**: Fixed concurrency limit, simpler but less adaptive.
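
For example, a fixed-concurrency run with `SemaphoreDispatcher` looks much like the custom-dispatcher example above (a minimal sketch, reusing the same `crawler` and `my_run_config`):

```python
from crawl4ai import SemaphoreDispatcher

dispatcher = SemaphoreDispatcher(semaphore_count=3)  # at most 3 concurrent crawls
results = await crawler.arun_many(
    urls=["https://site1.com", "https://site2.com", "https://site3.com"],
    config=my_run_config,
    dispatcher=dispatcher
)
```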
|
||||||
|
|
||||||
|
For advanced usage or custom settings, see [Multi-URL Crawling with Dispatchers](../advanced/multi-url-crawling.md).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Pitfalls
|
||||||
|
|
||||||
|
1. **Large Lists**: If you pass thousands of URLs, be mindful of memory or rate-limits. A dispatcher can help.
|
||||||
|
2. **Session Reuse**: If you need specialized logins or persistent contexts, ensure your dispatcher or tasks handle sessions accordingly.
|
||||||
|
3. **Error Handling**: Each `CrawlResult` might fail for different reasons—always check `result.success` or the `error_message` before proceeding.
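
A small sketch of defensive handling for the pitfalls above: check each result and collect the failures for a later retry pass. The `process()` helper and the retry step are illustrative, not built-in features.

```python
failed_urls = []

results = await crawler.arun_many(urls=urls, config=my_run_config)
for res in results:
    if res.success:
        process(res)  # your own handling of the successful CrawlResult
    else:
        print(f"[RETRY LATER] {res.url}: {res.error_message}")
        failed_urls.append(res.url)

# Optionally re-run only the failures, e.g. with a gentler rate limiter
if failed_urls:
    retry_results = await crawler.arun_many(urls=failed_urls, config=my_run_config)
```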
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Use `arun_many()` when you want to **crawl multiple URLs** simultaneously or in controlled parallel tasks. If you need advanced concurrency features (like memory-based adaptive throttling or complex rate-limiting), provide a **dispatcher**. Each result is a standard `CrawlResult`, possibly augmented with concurrency stats (`dispatch_result`) for deeper inspection. For more details on concurrency logic and dispatchers, see the [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) docs.
|
||||||
@@ -130,51 +130,88 @@ For **backward** compatibility, `arun()` can still accept direct arguments like
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 4. Helper Methods
|
## 4. Batch Processing: `arun_many()`
|
||||||
|
|
||||||
### 4.1 `arun_many()`
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
# Legacy parameters...
|
# Legacy parameters maintained for backwards compatibility...
|
||||||
) -> List[CrawlResult]:
|
) -> List[CrawlResult]:
|
||||||
...
|
"""
|
||||||
|
Process multiple URLs with intelligent rate limiting and resource monitoring.
|
||||||
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Crawls multiple URLs in concurrency. Accepts the same style `CrawlerRunConfig`. Example:
|
### 4.1 Resource-Aware Crawling
|
||||||
|
|
||||||
|
The `arun_many()` method now uses an intelligent dispatcher that:
|
||||||
|
- Monitors system memory usage
|
||||||
|
- Implements adaptive rate limiting
|
||||||
|
- Provides detailed progress monitoring
|
||||||
|
- Manages concurrent crawls efficiently
|
||||||
|
|
||||||
|
### 4.2 Example Usage
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
||||||
|
from crawl4ai.dispatcher import DisplayMode
|
||||||
|
|
||||||
|
# Configure browser
|
||||||
|
browser_cfg = BrowserConfig(headless=True)
|
||||||
|
|
||||||
|
# Configure crawler with rate limiting
|
||||||
run_cfg = CrawlerRunConfig(
|
run_cfg = CrawlerRunConfig(
|
||||||
# e.g., concurrency, wait_for, caching, extraction, etc.
|
# Enable rate limiting
|
||||||
semaphore_count=5
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 2.0), # Random delay between 1-2 seconds
|
||||||
|
max_delay=30.0, # Maximum delay after rate limit hits
|
||||||
|
max_retries=2, # Number of retries before giving up
|
||||||
|
rate_limit_codes=[429, 503] # Status codes that trigger rate limiting
|
||||||
|
),
|
||||||
|
# Resource monitoring
|
||||||
|
memory_threshold_percent=70.0, # Pause if memory exceeds this
|
||||||
|
check_interval=0.5, # How often to check resources
|
||||||
|
max_session_permit=3, # Maximum concurrent crawls
|
||||||
|
display_mode=DisplayMode.DETAILED.value # Show detailed progress
|
||||||
)
|
)
|
||||||
|
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
results = await crawler.arun_many(
|
results = await crawler.arun_many(urls, config=run_cfg)
|
||||||
urls=["https://example.com", "https://another.com"],
|
for result in results:
|
||||||
config=run_cfg
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
)
|
|
||||||
for r in results:
|
|
||||||
print(r.url, ":", len(r.cleaned_html))
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.2 `start()` & `close()`
|
### 4.3 Key Features
|
||||||
|
|
||||||
Allows manual lifecycle usage instead of context manager:
|
1. **Rate Limiting**
|
||||||
|
- Automatic delay between requests
|
||||||
|
- Exponential backoff on rate limit detection
|
||||||
|
- Domain-specific rate limiting
|
||||||
|
- Configurable retry strategy
|
||||||
|
|
||||||
```python
|
2. **Resource Monitoring**
|
||||||
crawler = AsyncWebCrawler(config=browser_cfg)
|
- Memory usage tracking
|
||||||
await crawler.start()
|
- Adaptive concurrency based on system load
|
||||||
|
- Automatic pausing when resources are constrained
|
||||||
|
|
||||||
# Perform multiple operations
|
3. **Progress Monitoring**
|
||||||
resultA = await crawler.arun("https://exampleA.com", config=run_cfg)
|
- Detailed or aggregated progress display
|
||||||
resultB = await crawler.arun("https://exampleB.com", config=run_cfg)
|
- Real-time status updates
|
||||||
|
- Memory usage statistics
|
||||||
|
|
||||||
await crawler.close()
|
4. **Error Handling**
|
||||||
```
|
- Graceful handling of rate limits
|
||||||
|
- Automatic retries with backoff
|
||||||
|
- Detailed error reporting
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ class CrawlResult(BaseModel):
|
|||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -262,7 +263,31 @@ if result.metadata:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 6. Example: Accessing Everything
|
## 6. `dispatch_result` (optional)
|
||||||
|
|
||||||
|
A `DispatchResult` object providing additional concurrency and resource usage information when crawling URLs in parallel (e.g., via `arun_many()` with custom dispatchers). It contains:
|
||||||
|
|
||||||
|
- **`task_id`**: A unique identifier for the parallel task.
|
||||||
|
- **`memory_usage`** (float): The memory (in MB) used at the time of completion.
|
||||||
|
- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task’s execution.
|
||||||
|
- **`start_time`** / **`end_time`** (datetime): Time range for this crawling task.
|
||||||
|
- **`error_message`** (str): Any dispatcher- or concurrency-related error encountered.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Example usage:
|
||||||
|
for result in results:
|
||||||
|
if result.success and result.dispatch_result:
|
||||||
|
dr = result.dispatch_result
|
||||||
|
print(f"URL: {result.url}, Task ID: {dr.task_id}")
|
||||||
|
print(f"Memory: {dr.memory_usage:.1f} MB (Peak: {dr.peak_memory:.1f} MB)")
|
||||||
|
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||||
|
```
|
||||||
|
|
||||||
|
> **Note**: This field is typically populated when using `arun_many(...)` alongside a **dispatcher** (e.g., `MemoryAdaptiveDispatcher` or `SemaphoreDispatcher`). If no concurrency or dispatcher is used, `dispatch_result` may remain `None`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Example: Accessing Everything
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async def handle_result(result: CrawlResult):
|
async def handle_result(result: CrawlResult):
|
||||||
@@ -306,7 +331,7 @@ async def handle_result(result: CrawlResult):
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Key Points & Future
|
## 8. Key Points & Future
|
||||||
|
|
||||||
1. **`markdown_v2` vs `markdown`**
|
1. **`markdown_v2` vs `markdown`**
|
||||||
- Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
|
- Right now, `markdown_v2` is the more robust container (`MarkdownGenerationResult`), providing **raw_markdown**, **markdown_with_citations**, references, plus possible **fit_markdown**.
|
||||||
|
|||||||
@@ -157,7 +157,32 @@ Use these for link-level content filtering (often to keep crawls “internal”
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### G) **Debug & Logging**
|
### G) **Rate Limiting & Resource Management**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`enable_rate_limiting`** | `bool` (default: `False`) | Enable intelligent rate limiting for multiple URLs |
|
||||||
|
| **`rate_limit_config`** | `RateLimitConfig` (default: `None`) | Configuration for rate limiting behavior |
|
||||||
|
|
||||||
|
The `RateLimitConfig` class has these fields:
|
||||||
|
|
||||||
|
| **Field** | **Type / Default** | **What It Does** |
|
||||||
|
|--------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`base_delay`** | `Tuple[float, float]` (1.0, 3.0) | Random delay range between requests to the same domain |
|
||||||
|
| **`max_delay`** | `float` (60.0) | Maximum delay after rate limit detection |
|
||||||
|
| **`max_retries`** | `int` (3) | Number of retries before giving up on rate-limited requests |
|
||||||
|
| **`rate_limit_codes`** | `List[int]` ([429, 503]) | HTTP status codes that trigger rate limiting behavior |
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|-------------------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`memory_threshold_percent`** | `float` (70.0) | Maximum memory usage before pausing new crawls |
|
||||||
|
| **`check_interval`** | `float` (1.0) | How often to check system resources (in seconds) |
|
||||||
|
| **`max_session_permit`** | `int` (20) | Maximum number of concurrent crawl sessions |
|
||||||
|
| **`display_mode`** | `str` (`None`, "DETAILED", "AGGREGATED") | How to display progress information |
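
Putting the two tables together, a run config using these fields might look like this (a sketch mirroring the defaults listed above):

```python
from crawl4ai import CrawlerRunConfig, RateLimitConfig

run_cfg = CrawlerRunConfig(
    enable_rate_limiting=True,
    rate_limit_config=RateLimitConfig(
        base_delay=(1.0, 3.0),
        max_delay=60.0,
        max_retries=3,
        rate_limit_codes=[429, 503],
    ),
    memory_threshold_percent=70.0,  # pause new crawls above this memory usage
    check_interval=1.0,             # resource check frequency in seconds
    max_session_permit=20,          # cap on concurrent crawl sessions
    display_mode="DETAILED",        # or "AGGREGATED"
)
```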
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### H) **Debug & Logging**
|
||||||
|
|
||||||
| **Parameter** | **Type / Default** | **What It Does** |
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|----------------|--------------------|---------------------------------------------------------------------------|
|
|----------------|--------------------|---------------------------------------------------------------------------|
|
||||||
@@ -170,7 +195,7 @@ Use these for link-level content filtering (often to keep crawls “internal”
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, RateLimitConfig
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Configure the browser
|
# Configure the browser
|
||||||
@@ -190,7 +215,18 @@ async def main():
|
|||||||
excluded_tags=["script", "style"],
|
excluded_tags=["script", "style"],
|
||||||
exclude_external_links=True,
|
exclude_external_links=True,
|
||||||
wait_for="css:.article-loaded",
|
wait_for="css:.article-loaded",
|
||||||
screenshot=True
|
screenshot=True,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
),
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=20,
|
||||||
|
display_mode="DETAILED"
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
@@ -223,4 +259,3 @@ if __name__ == "__main__":
|
|||||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||||
|
|
||||||
|
|||||||
@@ -116,6 +116,12 @@ class CrawlerRunConfig:
|
|||||||
wait_for=None,
|
wait_for=None,
|
||||||
screenshot=False,
|
screenshot=False,
|
||||||
pdf=False,
|
pdf=False,
|
||||||
|
enable_rate_limiting=False,
|
||||||
|
rate_limit_config=None,
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=20,
|
||||||
|
display_mode=None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
# ... other advanced parameters omitted
|
# ... other advanced parameters omitted
|
||||||
):
|
):
|
||||||
@@ -156,6 +162,58 @@ class CrawlerRunConfig:
|
|||||||
- Logs additional runtime details.
|
- Logs additional runtime details.
|
||||||
- Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
|
- Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
|
||||||
|
|
||||||
|
9. **`enable_rate_limiting`**:
|
||||||
|
- If `True`, enables rate limiting for batch processing.
|
||||||
|
- Requires `rate_limit_config` to be set.
|
||||||
|
|
||||||
|
10. **`rate_limit_config`**:
|
||||||
|
- A `RateLimitConfig` object controlling rate limiting behavior.
|
||||||
|
- See below for details.
|
||||||
|
|
||||||
|
11. **`memory_threshold_percent`**:
|
||||||
|
- The memory threshold (as a percentage) to monitor.
|
||||||
|
- If exceeded, the crawler will pause or slow down.
|
||||||
|
|
||||||
|
12. **`check_interval`**:
|
||||||
|
- The interval (in seconds) to check system resources.
|
||||||
|
- Affects how often memory and CPU usage are monitored.
|
||||||
|
|
||||||
|
13. **`max_session_permit`**:
|
||||||
|
- The maximum number of concurrent crawl sessions.
|
||||||
|
- Helps prevent overwhelming the system.
|
||||||
|
|
||||||
|
14. **`display_mode`**:
|
||||||
|
- The display mode for progress information (`DETAILED` or `AGGREGATED`).
|
||||||
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
|
### Rate Limiting & Resource Management
|
||||||
|
|
||||||
|
For batch processing with `arun_many()`, you can enable intelligent rate limiting:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import RateLimitConfig
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0), # Random delay range
|
||||||
|
max_delay=60.0, # Max delay after rate limits
|
||||||
|
max_retries=3, # Retries before giving up
|
||||||
|
rate_limit_codes=[429, 503] # Status codes to watch
|
||||||
|
),
|
||||||
|
memory_threshold_percent=70.0, # Memory threshold
|
||||||
|
check_interval=1.0, # Resource check interval
|
||||||
|
max_session_permit=20, # Max concurrent crawls
|
||||||
|
display_mode="DETAILED" # Progress display mode
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This configuration:
|
||||||
|
- Implements intelligent rate limiting per domain
|
||||||
|
- Monitors system resources
|
||||||
|
- Provides detailed progress information
|
||||||
|
- Manages concurrent crawls efficiently
|
||||||
|
|
||||||
**Minimal Example**:
|
**Minimal Example**:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -164,7 +222,14 @@ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|||||||
crawl_conf = CrawlerRunConfig(
|
crawl_conf = CrawlerRunConfig(
|
||||||
js_code="document.querySelector('button#loadMore')?.click()",
|
js_code="document.querySelector('button#loadMore')?.click()",
|
||||||
wait_for="css:.loaded-content",
|
wait_for="css:.loaded-content",
|
||||||
screenshot=True
|
screenshot=True,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -205,7 +270,14 @@ async def main():
|
|||||||
# 3) Crawler run config: skip cache, use extraction
|
# 3) Crawler run config: skip cache, use extraction
|
||||||
run_conf = CrawlerRunConfig(
|
run_conf = CrawlerRunConfig(
|
||||||
extraction_strategy=extraction,
|
extraction_strategy=extraction,
|
||||||
cache_mode=CacheMode.BYPASS
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
enable_rate_limiting=True,
|
||||||
|
rate_limit_config=RateLimitConfig(
|
||||||
|
base_delay=(1.0, 3.0),
|
||||||
|
max_delay=60.0,
|
||||||
|
max_retries=3,
|
||||||
|
rate_limit_codes=[429, 503]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
async with AsyncWebCrawler(config=browser_conf) as crawler:
|
||||||
|
|||||||
@@ -1,7 +1,3 @@
|
|||||||
Below is the **revised Quickstart** guide with the **Installation** section removed, plus an updated **dynamic content** crawl example that uses `BrowserConfig` and `CrawlerRunConfig` (instead of passing parameters directly to `arun()`). Everything else remains as before.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Getting Started with Crawl4AI
|
# Getting Started with Crawl4AI
|
||||||
|
|
||||||
Welcome to **Crawl4AI**, an open-source LLM-friendly Web Crawler & Scraper. In this tutorial, you’ll:
|
Welcome to **Crawl4AI**, an open-source LLM-friendly Web Crawler & Scraper. In this tutorial, you’ll:
|
||||||
@@ -254,7 +250,39 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Dynamic Content Example
|
## 7. Multi-URL Concurrency (Preview)
|
||||||
|
|
||||||
|
If you need to crawl multiple URLs in **parallel**, you can use `arun_many()`. By default, Crawl4AI employs a **MemoryAdaptiveDispatcher**, automatically adjusting concurrency based on system resources. Here’s a quick glimpse:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
async def quick_parallel_example():
|
||||||
|
urls = [
|
||||||
|
"https://example.com/page1",
|
||||||
|
"https://example.com/page2",
|
||||||
|
"https://example.com/page3"
|
||||||
|
]
|
||||||
|
|
||||||
|
run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results = await crawler.arun_many(urls, config=run_conf)
|
||||||
|
for res in results:
|
||||||
|
if res.success:
|
||||||
|
print(f"[OK] {res.url}, length: {len(res.markdown_v2.raw_markdown)}")
|
||||||
|
else:
|
||||||
|
print(f"[ERROR] {res.url} => {res.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(quick_parallel_example())
|
||||||
|
```
|
||||||
|
|
||||||
|
For more advanced concurrency (e.g., a **semaphore-based** approach, **adaptive memory usage throttling**, or customized rate limiting), see [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md).
|
||||||
|
|
||||||
|
|
||||||
|
## 8. Dynamic Content Example
|
||||||
|
|
||||||
Some sites require multiple “page clicks” or dynamic JavaScript updates. Below is an example showing how to **click** a “Next Page” button and wait for new commits to load on GitHub, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
|
Some sites require multiple “page clicks” or dynamic JavaScript updates. Below is an example showing how to **click** a “Next Page” button and wait for new commits to load on GitHub, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
|
||||||
|
|
||||||
@@ -343,7 +371,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 8. Next Steps
|
## 9. Next Steps
|
||||||
|
|
||||||
Congratulations! You have:
|
Congratulations! You have:
|
||||||
|
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ nav:
|
|||||||
- API Reference:
|
- API Reference:
|
||||||
- "AsyncWebCrawler": "api/async-webcrawler.md"
|
- "AsyncWebCrawler": "api/async-webcrawler.md"
|
||||||
- "arun()": "api/arun.md"
|
- "arun()": "api/arun.md"
|
||||||
|
- "arun_many()": "api/arun_many.md"
|
||||||
- "Browser & Crawler Config": "api/parameters.md"
|
- "Browser & Crawler Config": "api/parameters.md"
|
||||||
- "CrawlResult": "api/crawl-result.md"
|
- "CrawlResult": "api/crawl-result.md"
|
||||||
- "Strategies": "api/strategies.md"
|
- "Strategies": "api/strategies.md"
|
||||||
|
|||||||
147
tests/async/test_dispatchers.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
import pytest
|
||||||
|
import asyncio, time
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||||
|
MemoryAdaptiveDispatcher, SemaphoreDispatcher,
|
||||||
|
RateLimiter, CrawlerMonitor, DisplayMode, CacheMode
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def browser_config():
|
||||||
|
return BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def run_config():
|
||||||
|
return CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
verbose=False
|
||||||
|
)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def test_urls():
|
||||||
|
return [
|
||||||
|
"http://example.com",
|
||||||
|
"http://example.com/page1",
|
||||||
|
"http://example.com/page2"
|
||||||
|
]
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
class TestDispatchStrategies:
|
||||||
|
|
||||||
|
async def test_memory_adaptive_basic(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_memory_adaptive_with_rate_limit(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_semaphore_basic(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=2
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_semaphore_with_rate_limit(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = SemaphoreDispatcher(
|
||||||
|
semaphore_count=2,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2
|
||||||
|
)
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
assert all(r.success for r in results)
|
||||||
|
|
||||||
|
async def test_memory_adaptive_memory_error(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=1.0, # Set unrealistically low threshold
|
||||||
|
max_session_permit=2,
|
||||||
|
check_interval=0.1,
|
||||||
|
memory_wait_timeout=1.0 # Short timeout for testing
|
||||||
|
)
|
||||||
|
with pytest.raises(MemoryError):
|
||||||
|
await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
|
||||||
|
async def test_empty_urls(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many([], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 0
|
||||||
|
|
||||||
|
async def test_single_url(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many(["http://example.com"], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0].success
|
||||||
|
|
||||||
|
async def test_invalid_urls(self, browser_config, run_config):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(max_session_permit=2)
|
||||||
|
results = await crawler.arun_many(["http://invalid.url.that.doesnt.exist"], config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert not results[0].success
|
||||||
|
|
||||||
|
async def test_rate_limit_backoff(self, browser_config, run_config):
|
||||||
|
urls = ["http://example.com"] * 5 # Multiple requests to same domain
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
max_session_permit=2,
|
||||||
|
rate_limiter=RateLimiter(
|
||||||
|
base_delay=(0.1, 0.2),
|
||||||
|
max_delay=1.0,
|
||||||
|
max_retries=2,
|
||||||
|
rate_limit_codes=[200] # Force rate limiting for testing
|
||||||
|
)
|
||||||
|
)
|
||||||
|
start_time = time.time()
|
||||||
|
results = await crawler.arun_many(urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
duration = time.time() - start_time
|
||||||
|
assert len(results) == len(urls)
|
||||||
|
assert duration > 1.0 # Ensure rate limiting caused delays
|
||||||
|
|
||||||
|
async def test_monitor_integration(self, browser_config, run_config, test_urls):
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
monitor = CrawlerMonitor(max_visible_rows=5, display_mode=DisplayMode.DETAILED)
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
max_session_permit=2,
|
||||||
|
monitor=monitor
|
||||||
|
)
|
||||||
|
results = await crawler.arun_many(test_urls, config=run_config, dispatcher=dispatcher)
|
||||||
|
assert len(results) == len(test_urls)
|
||||||
|
# Check monitor stats
|
||||||
|
assert len(monitor.stats) == len(test_urls)
|
||||||
|
assert all(stat.end_time is not None for stat in monitor.stats.values())
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main([__file__, "-v", "--asyncio-mode=auto"])
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
import pytest
|
|
||||||
import asyncio
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, RateLimitConfig
|
|
||||||
from crawl4ai.dispatcher import DisplayMode
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_with_dispatcher():
|
|
||||||
# Create test URLs
|
|
||||||
urls = [f"https://example.com/page_{i}" for i in range(5)]
|
|
||||||
|
|
||||||
# Configure browser
|
|
||||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
|
||||||
|
|
||||||
# Configure crawler with rate limiting
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
enable_rate_limiting=True,
|
|
||||||
rate_limit_config=RateLimitConfig(
|
|
||||||
base_delay=(1.0, 2.0),
|
|
||||||
max_delay=30.0,
|
|
||||||
max_retries=2,
|
|
||||||
rate_limit_codes=[429, 503]
|
|
||||||
),
|
|
||||||
memory_threshold_percent=70.0,
|
|
||||||
check_interval=0.5,
|
|
||||||
max_session_permit=3,
|
|
||||||
display_mode=DisplayMode.DETAILED.value
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
results = await crawler.arun_many(urls, config=run_config)
|
|
||||||
|
|
||||||
# Basic validation
|
|
||||||
assert len(results) == len(urls)
|
|
||||||
for result in results:
|
|
||||||
assert result is not None
|
|
||||||
# Note: example.com URLs will fail, which is expected for this test
|
|
||||||
assert not result.success # We expect these to fail since they're fake URLs
|
|
||||||