feat(tests): implement high volume stress testing framework
Add comprehensive stress testing solution for SDK using arun_many and dispatcher system: - Create test_stress_sdk.py for running high volume crawl tests - Add run_benchmark.py for orchestrating tests with predefined configs - Implement benchmark_report.py for generating performance reports - Add memory tracking and local test site generation - Support both streaming and batch processing modes - Add detailed documentation in README.md The framework enables testing SDK performance, concurrency handling, and memory behavior under high-volume scenarios.
This commit is contained in:
887
tests/memory/benchmark_report.py
Executable file
887
tests/memory/benchmark_report.py
Executable file
@@ -0,0 +1,887 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Benchmark reporting tool for Crawl4AI stress tests.
|
||||
Generates visual reports and comparisons between test runs.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
|
||||
# Initialize rich console -- single shared console for all terminal output
# in this module (summary tables, warnings, report links).
console = Console()

# Try to import optional visualization dependencies.
# The reporter degrades gracefully: without pandas/matplotlib/seaborn it
# still produces the text summary and an HTML report without charts.
VISUALIZATION_AVAILABLE = True
try:
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    import numpy as np
    import seaborn as sns
except ImportError:
    VISUALIZATION_AVAILABLE = False
    console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
    console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
    console.print("[yellow]Only text-based reports will be generated.[/yellow]")

# Configure plotting if available
if VISUALIZATION_AVAILABLE:
    # Set plot style for dark theme (matches the dark HTML report)
    plt.style.use('dark_background')
    sns.set_theme(style="darkgrid")

    # Custom color palette based on Nord theme
    nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
    sns.set_palette(nord_palette)
||||
class BenchmarkReporter:
    """Generates visual reports and comparisons for Crawl4AI stress tests."""

    def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
        """Initialize the benchmark reporter.

        Args:
            reports_dir: Directory containing test result files
            output_dir: Directory to save generated reports
        """
        self.reports_dir = Path(reports_dir)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Without the visualization stack there is nothing to configure.
        if not VISUALIZATION_AVAILABLE:
            return

        # Ensure the matplotlib backend works in headless environments (CI).
        mpl.use('Agg')

        # Dark-theme styling shared by every chart this reporter produces.
        mpl.rcParams.update({
            'figure.figsize': (12, 8),
            'font.size': 12,
            'axes.labelsize': 14,
            'axes.titlesize': 16,
            'xtick.labelsize': 12,
            'ytick.labelsize': 12,
            'legend.fontsize': 12,
            'figure.facecolor': '#1e1e1e',
            'axes.facecolor': '#2e3440',
            'savefig.facecolor': '#1e1e1e',
            'text.color': '#e0e0e0',
            'axes.labelcolor': '#e0e0e0',
            'xtick.color': '#e0e0e0',
            'ytick.color': '#e0e0e0',
            'grid.color': '#444444',
            'figure.edgecolor': '#444444',
        })
||||
def load_test_results(self, limit=None):
|
||||
"""Load all test results from the reports directory.
|
||||
|
||||
Args:
|
||||
limit: Optional limit on number of most recent tests to load
|
||||
|
||||
Returns:
|
||||
Dictionary mapping test IDs to result data
|
||||
"""
|
||||
result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
|
||||
|
||||
# Sort files by modification time (newest first)
|
||||
result_files.sort(key=os.path.getmtime, reverse=True)
|
||||
|
||||
if limit:
|
||||
result_files = result_files[:limit]
|
||||
|
||||
results = {}
|
||||
for file_path in result_files:
|
||||
try:
|
||||
with open(file_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
test_id = data.get('test_id')
|
||||
if test_id:
|
||||
results[test_id] = data
|
||||
|
||||
# Try to load the corresponding memory samples
|
||||
csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
|
||||
if csv_path.exists():
|
||||
try:
|
||||
memory_df = pd.read_csv(csv_path)
|
||||
results[test_id]['memory_samples'] = memory_df
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error loading {file_path}: {e}[/red]")
|
||||
|
||||
console.print(f"Loaded {len(results)} test results")
|
||||
return results
|
||||
|
||||
def generate_summary_table(self, results):
|
||||
"""Generate a summary table of test results.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
|
||||
Returns:
|
||||
Rich Table object
|
||||
"""
|
||||
table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
|
||||
|
||||
# Define columns
|
||||
table.add_column("Test ID", style="cyan")
|
||||
table.add_column("Date", style="bright_green")
|
||||
table.add_column("URLs", justify="right")
|
||||
table.add_column("Workers", justify="right")
|
||||
table.add_column("Success %", justify="right")
|
||||
table.add_column("Time (s)", justify="right")
|
||||
table.add_column("Mem Growth", justify="right")
|
||||
table.add_column("URLs/sec", justify="right")
|
||||
|
||||
# Add rows
|
||||
for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
|
||||
# Parse timestamp from test_id
|
||||
try:
|
||||
date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
|
||||
except:
|
||||
date_str = "Unknown"
|
||||
|
||||
# Calculate success percentage
|
||||
total_urls = data.get('url_count', 0)
|
||||
successful = data.get('successful_urls', 0)
|
||||
success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
|
||||
|
||||
# Calculate memory growth if available
|
||||
mem_growth = "N/A"
|
||||
if 'memory_samples' in data:
|
||||
samples = data['memory_samples']
|
||||
if len(samples) >= 2:
|
||||
# Try to extract numeric values from memory_info strings
|
||||
try:
|
||||
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
|
||||
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
|
||||
mem_growth = f"{last_mem - first_mem:.1f} MB"
|
||||
except:
|
||||
pass
|
||||
|
||||
# Calculate URLs per second
|
||||
time_taken = data.get('total_time_seconds', 0)
|
||||
urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
|
||||
|
||||
table.add_row(
|
||||
test_id,
|
||||
date_str,
|
||||
str(total_urls),
|
||||
str(data.get('workers', 'N/A')),
|
||||
f"{success_pct:.1f}%",
|
||||
f"{data.get('total_time_seconds', 0):.2f}",
|
||||
mem_growth,
|
||||
f"{urls_per_sec:.1f}"
|
||||
)
|
||||
|
||||
return table
|
||||
|
||||
def generate_performance_chart(self, results, output_file=None):
|
||||
"""Generate a performance comparison chart.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
output_file: File path to save the chart
|
||||
|
||||
Returns:
|
||||
Path to the saved chart file or None if visualization is not available
|
||||
"""
|
||||
if not VISUALIZATION_AVAILABLE:
|
||||
console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
|
||||
return None
|
||||
|
||||
# Extract relevant data
|
||||
data = []
|
||||
for test_id, result in results.items():
|
||||
urls = result.get('url_count', 0)
|
||||
workers = result.get('workers', 0)
|
||||
time_taken = result.get('total_time_seconds', 0)
|
||||
urls_per_sec = urls / time_taken if time_taken > 0 else 0
|
||||
|
||||
# Parse timestamp from test_id for sorting
|
||||
try:
|
||||
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
|
||||
data.append({
|
||||
'test_id': test_id,
|
||||
'timestamp': timestamp,
|
||||
'urls': urls,
|
||||
'workers': workers,
|
||||
'time_seconds': time_taken,
|
||||
'urls_per_sec': urls_per_sec
|
||||
})
|
||||
except:
|
||||
console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
|
||||
|
||||
if not data:
|
||||
console.print("[yellow]No valid data for performance chart[/yellow]")
|
||||
return None
|
||||
|
||||
# Convert to DataFrame and sort by timestamp
|
||||
df = pd.DataFrame(data)
|
||||
df = df.sort_values('timestamp')
|
||||
|
||||
# Create the plot
|
||||
fig, ax1 = plt.subplots(figsize=(12, 6))
|
||||
|
||||
# Plot URLs per second as bars with properly set x-axis
|
||||
x_pos = range(len(df['test_id']))
|
||||
bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
|
||||
ax1.set_ylabel('URLs per Second', color='#88c0d0')
|
||||
ax1.tick_params(axis='y', labelcolor='#88c0d0')
|
||||
|
||||
# Properly set x-axis labels
|
||||
ax1.set_xticks(x_pos)
|
||||
ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
|
||||
|
||||
# Add worker count as text on each bar
|
||||
for i, bar in enumerate(bars):
|
||||
height = bar.get_height()
|
||||
workers = df.iloc[i]['workers']
|
||||
ax1.text(i, height + 0.1,
|
||||
f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
|
||||
|
||||
# Add a second y-axis for total URLs
|
||||
ax2 = ax1.twinx()
|
||||
ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
|
||||
ax2.set_ylabel('Total URLs', color='#bf616a')
|
||||
ax2.tick_params(axis='y', labelcolor='#bf616a')
|
||||
|
||||
# Set title and layout
|
||||
plt.title('Crawl4AI Performance Benchmarks')
|
||||
plt.tight_layout()
|
||||
|
||||
# Save the figure
|
||||
if output_file is None:
|
||||
output_file = self.output_dir / "performance_comparison.png"
|
||||
plt.savefig(output_file, dpi=100, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
return output_file
|
||||
|
||||
def generate_memory_charts(self, results, output_prefix=None):
|
||||
"""Generate memory usage charts for each test.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
output_prefix: Prefix for output file names
|
||||
|
||||
Returns:
|
||||
List of paths to the saved chart files
|
||||
"""
|
||||
if not VISUALIZATION_AVAILABLE:
|
||||
console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
|
||||
return []
|
||||
|
||||
output_files = []
|
||||
|
||||
for test_id, result in results.items():
|
||||
if 'memory_samples' not in result:
|
||||
continue
|
||||
|
||||
memory_df = result['memory_samples']
|
||||
|
||||
# Check if we have enough data points
|
||||
if len(memory_df) < 2:
|
||||
continue
|
||||
|
||||
# Try to extract numeric values from memory_info strings
|
||||
try:
|
||||
memory_values = []
|
||||
for mem_str in memory_df['memory_info']:
|
||||
# Extract the number from strings like "142.8 MB"
|
||||
value = float(mem_str.split()[0])
|
||||
memory_values.append(value)
|
||||
|
||||
memory_df['memory_mb'] = memory_values
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
|
||||
continue
|
||||
|
||||
# Create the plot
|
||||
plt.figure(figsize=(10, 6))
|
||||
|
||||
# Plot memory usage over time
|
||||
plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'],
|
||||
color='#88c0d0', marker='o', linewidth=2, markersize=4)
|
||||
|
||||
# Add annotations for chunk processing
|
||||
chunk_size = result.get('chunk_size', 0)
|
||||
url_count = result.get('url_count', 0)
|
||||
if chunk_size > 0 and url_count > 0:
|
||||
# Estimate chunk processing times
|
||||
num_chunks = (url_count + chunk_size - 1) // chunk_size # Ceiling division
|
||||
total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
|
||||
chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
|
||||
|
||||
for i, time_point in enumerate(chunk_times):
|
||||
if time_point <= memory_df['elapsed_seconds'].max():
|
||||
plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
|
||||
plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}',
|
||||
rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
|
||||
|
||||
# Set labels and title
|
||||
plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
|
||||
plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
|
||||
plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)',
|
||||
color='#e0e0e0')
|
||||
|
||||
# Add grid and set y-axis to start from zero
|
||||
plt.grid(True, alpha=0.3, color='#4c566a')
|
||||
|
||||
# Add test metadata as text
|
||||
info_text = (
|
||||
f"URLs: {url_count}\n"
|
||||
f"Workers: {result.get('workers', 'N/A')}\n"
|
||||
f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
|
||||
f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
|
||||
)
|
||||
|
||||
# Calculate memory growth
|
||||
if len(memory_df) >= 2:
|
||||
first_mem = memory_df.iloc[0]['memory_mb']
|
||||
last_mem = memory_df.iloc[-1]['memory_mb']
|
||||
growth = last_mem - first_mem
|
||||
growth_rate = growth / result.get('total_time_seconds', 1)
|
||||
|
||||
info_text += f"Memory Growth: {growth:.1f} MB\n"
|
||||
info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
|
||||
|
||||
plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
|
||||
bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
|
||||
|
||||
# Save the figure
|
||||
if output_prefix is None:
|
||||
output_file = self.output_dir / f"memory_chart_{test_id}.png"
|
||||
else:
|
||||
output_file = Path(f"{output_prefix}_memory_{test_id}.png")
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig(output_file, dpi=100, bbox_inches='tight')
|
||||
plt.close()
|
||||
|
||||
output_files.append(output_file)
|
||||
|
||||
return output_files
|
||||
|
||||
def generate_comparison_report(self, results, title=None, output_file=None):
|
||||
"""Generate a comprehensive comparison report of multiple test runs.
|
||||
|
||||
Args:
|
||||
results: Dictionary mapping test IDs to result data
|
||||
title: Optional title for the report
|
||||
output_file: File path to save the report
|
||||
|
||||
Returns:
|
||||
Path to the saved report file
|
||||
"""
|
||||
if not results:
|
||||
console.print("[yellow]No results to generate comparison report[/yellow]")
|
||||
return None
|
||||
|
||||
if output_file is None:
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
output_file = self.output_dir / f"comparison_report_{timestamp}.html"
|
||||
|
||||
# Create data for the report
|
||||
rows = []
|
||||
for test_id, data in results.items():
|
||||
# Calculate metrics
|
||||
urls = data.get('url_count', 0)
|
||||
workers = data.get('workers', 0)
|
||||
successful = data.get('successful_urls', 0)
|
||||
failed = data.get('failed_urls', 0)
|
||||
time_seconds = data.get('total_time_seconds', 0)
|
||||
|
||||
# Calculate additional metrics
|
||||
success_rate = (successful / urls) * 100 if urls > 0 else 0
|
||||
urls_per_second = urls / time_seconds if time_seconds > 0 else 0
|
||||
urls_per_worker = urls / workers if workers > 0 else 0
|
||||
|
||||
# Calculate memory growth if available
|
||||
mem_start = None
|
||||
mem_end = None
|
||||
mem_growth = None
|
||||
if 'memory_samples' in data:
|
||||
samples = data['memory_samples']
|
||||
if len(samples) >= 2:
|
||||
try:
|
||||
first_mem = float(samples.iloc[0]['memory_info'].split()[0])
|
||||
last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
|
||||
mem_start = first_mem
|
||||
mem_end = last_mem
|
||||
mem_growth = last_mem - first_mem
|
||||
except:
|
||||
pass
|
||||
|
||||
# Parse timestamp from test_id
|
||||
try:
|
||||
timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
|
||||
except:
|
||||
timestamp = None
|
||||
|
||||
rows.append({
|
||||
'test_id': test_id,
|
||||
'timestamp': timestamp,
|
||||
'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
|
||||
'urls': urls,
|
||||
'workers': workers,
|
||||
'chunk_size': data.get('chunk_size', 0),
|
||||
'successful': successful,
|
||||
'failed': failed,
|
||||
'success_rate': success_rate,
|
||||
'time_seconds': time_seconds,
|
||||
'urls_per_second': urls_per_second,
|
||||
'urls_per_worker': urls_per_worker,
|
||||
'memory_start': mem_start,
|
||||
'memory_end': mem_end,
|
||||
'memory_growth': mem_growth
|
||||
})
|
||||
|
||||
# Sort data by timestamp if possible
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
# Convert to DataFrame and sort by timestamp
|
||||
df = pd.DataFrame(rows)
|
||||
if 'timestamp' in df.columns and not df['timestamp'].isna().all():
|
||||
df = df.sort_values('timestamp', ascending=False)
|
||||
else:
|
||||
# Simple sorting without pandas
|
||||
rows.sort(key=lambda x: x.get('timestamp', datetime.now()), reverse=True)
|
||||
df = None
|
||||
|
||||
# Generate HTML report
|
||||
html = []
|
||||
html.append('<!DOCTYPE html>')
|
||||
html.append('<html lang="en">')
|
||||
html.append('<head>')
|
||||
html.append('<meta charset="UTF-8">')
|
||||
html.append('<meta name="viewport" content="width=device-width, initial-scale=1.0">')
|
||||
html.append(f'<title>{title or "Crawl4AI Benchmark Comparison"}</title>')
|
||||
html.append('<style>')
|
||||
html.append('''
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
color: #e0e0e0;
|
||||
background-color: #1e1e1e;
|
||||
}
|
||||
h1, h2, h3 {
|
||||
color: #81a1c1;
|
||||
}
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
width: 100%;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
th, td {
|
||||
text-align: left;
|
||||
padding: 12px;
|
||||
border-bottom: 1px solid #444;
|
||||
}
|
||||
th {
|
||||
background-color: #2e3440;
|
||||
font-weight: bold;
|
||||
}
|
||||
tr:hover {
|
||||
background-color: #2e3440;
|
||||
}
|
||||
a {
|
||||
color: #88c0d0;
|
||||
text-decoration: none;
|
||||
}
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.chart-container {
|
||||
margin: 30px 0;
|
||||
text-align: center;
|
||||
background-color: #2e3440;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
}
|
||||
.chart-container img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
border: 1px solid #444;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.3);
|
||||
}
|
||||
.card {
|
||||
border: 1px solid #444;
|
||||
border-radius: 8px;
|
||||
padding: 15px;
|
||||
margin-bottom: 20px;
|
||||
background-color: #2e3440;
|
||||
box-shadow: 0 0 10px rgba(0,0,0,0.2);
|
||||
}
|
||||
.highlight {
|
||||
background-color: #3b4252;
|
||||
font-weight: bold;
|
||||
}
|
||||
.status-good {
|
||||
color: #a3be8c;
|
||||
}
|
||||
.status-warning {
|
||||
color: #ebcb8b;
|
||||
}
|
||||
.status-bad {
|
||||
color: #bf616a;
|
||||
}
|
||||
''')
|
||||
html.append('</style>')
|
||||
html.append('</head>')
|
||||
html.append('<body>')
|
||||
|
||||
# Header
|
||||
html.append(f'<h1>{title or "Crawl4AI Benchmark Comparison"}</h1>')
|
||||
html.append(f'<p>Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>')
|
||||
|
||||
# Summary section
|
||||
html.append('<div class="card">')
|
||||
html.append('<h2>Summary</h2>')
|
||||
html.append('<p>This report compares the performance of Crawl4AI across multiple test runs.</p>')
|
||||
|
||||
# Summary metrics
|
||||
data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0)
|
||||
if data_available:
|
||||
# Get the latest test data
|
||||
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
|
||||
latest_test = df.iloc[0]
|
||||
latest_id = latest_test['test_id']
|
||||
else:
|
||||
latest_test = rows[0] # First row (already sorted by timestamp)
|
||||
latest_id = latest_test['test_id']
|
||||
|
||||
html.append('<h3>Latest Test Results</h3>')
|
||||
html.append('<ul>')
|
||||
html.append(f'<li><strong>Test ID:</strong> {latest_id}</li>')
|
||||
html.append(f'<li><strong>Date:</strong> {latest_test["date"]}</li>')
|
||||
html.append(f'<li><strong>URLs:</strong> {latest_test["urls"]}</li>')
|
||||
html.append(f'<li><strong>Workers:</strong> {latest_test["workers"]}</li>')
|
||||
html.append(f'<li><strong>Success Rate:</strong> {latest_test["success_rate"]:.1f}%</li>')
|
||||
html.append(f'<li><strong>Time:</strong> {latest_test["time_seconds"]:.2f} seconds</li>')
|
||||
html.append(f'<li><strong>Performance:</strong> {latest_test["urls_per_second"]:.1f} URLs/second</li>')
|
||||
|
||||
# Check memory growth (handle both pandas and dict mode)
|
||||
memory_growth_available = False
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
if pd.notna(latest_test["memory_growth"]):
|
||||
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
|
||||
memory_growth_available = True
|
||||
else:
|
||||
if latest_test["memory_growth"] is not None:
|
||||
html.append(f'<li><strong>Memory Growth:</strong> {latest_test["memory_growth"]:.1f} MB</li>')
|
||||
memory_growth_available = True
|
||||
|
||||
html.append('</ul>')
|
||||
|
||||
# If we have more than one test, show trend
|
||||
if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1):
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
prev_test = df.iloc[1]
|
||||
else:
|
||||
prev_test = rows[1]
|
||||
|
||||
# Calculate performance change
|
||||
perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0
|
||||
|
||||
status_class = ""
|
||||
if perf_change > 5:
|
||||
status_class = "status-good"
|
||||
elif perf_change < -5:
|
||||
status_class = "status-bad"
|
||||
|
||||
html.append('<h3>Performance Trend</h3>')
|
||||
html.append('<ul>')
|
||||
html.append(f'<li><strong>Performance Change:</strong> <span class="{status_class}">{perf_change:+.1f}%</span> compared to previous test</li>')
|
||||
|
||||
# Memory trend if available
|
||||
memory_trend_available = False
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]):
|
||||
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
|
||||
memory_trend_available = True
|
||||
else:
|
||||
if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None:
|
||||
mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
|
||||
memory_trend_available = True
|
||||
|
||||
if memory_trend_available:
|
||||
mem_status = ""
|
||||
if mem_change < -1: # Improved (less growth)
|
||||
mem_status = "status-good"
|
||||
elif mem_change > 1: # Worse (more growth)
|
||||
mem_status = "status-bad"
|
||||
|
||||
html.append(f'<li><strong>Memory Trend:</strong> <span class="{mem_status}">{mem_change:+.1f} MB</span> change in memory growth</li>')
|
||||
|
||||
html.append('</ul>')
|
||||
|
||||
html.append('</div>')
|
||||
|
||||
# Generate performance chart if visualization is available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
perf_chart = self.generate_performance_chart(results)
|
||||
if perf_chart:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Performance Comparison</h2>')
|
||||
html.append(f'<img src="{os.path.relpath(perf_chart, os.path.dirname(output_file))}" alt="Performance Comparison Chart">')
|
||||
html.append('</div>')
|
||||
else:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Performance Comparison</h2>')
|
||||
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
|
||||
html.append('</div>')
|
||||
|
||||
# Generate memory charts if visualization is available
|
||||
if VISUALIZATION_AVAILABLE:
|
||||
memory_charts = self.generate_memory_charts(results)
|
||||
if memory_charts:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Memory Usage</h2>')
|
||||
|
||||
for chart in memory_charts:
|
||||
test_id = chart.stem.split('_')[-1]
|
||||
html.append(f'<h3>Test {test_id}</h3>')
|
||||
html.append(f'<img src="{os.path.relpath(chart, os.path.dirname(output_file))}" alt="Memory Chart for {test_id}">')
|
||||
|
||||
html.append('</div>')
|
||||
else:
|
||||
html.append('<div class="chart-container">')
|
||||
html.append('<h2>Memory Usage</h2>')
|
||||
html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
|
||||
html.append('</div>')
|
||||
|
||||
# Detailed results table
|
||||
html.append('<h2>Detailed Results</h2>')
|
||||
|
||||
# Add the results as an HTML table
|
||||
html.append('<table>')
|
||||
|
||||
# Table headers
|
||||
html.append('<tr>')
|
||||
for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']:
|
||||
html.append(f'<th>{col}</th>')
|
||||
html.append('</tr>')
|
||||
|
||||
# Table rows - handle both pandas DataFrame and list of dicts
|
||||
if VISUALIZATION_AVAILABLE and df is not None:
|
||||
# Using pandas DataFrame
|
||||
for _, row in df.iterrows():
|
||||
html.append('<tr>')
|
||||
html.append(f'<td>{row["test_id"]}</td>')
|
||||
html.append(f'<td>{row["date"]}</td>')
|
||||
html.append(f'<td>{row["urls"]}</td>')
|
||||
html.append(f'<td>{row["workers"]}</td>')
|
||||
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
|
||||
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
|
||||
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
|
||||
|
||||
# Memory growth cell
|
||||
if pd.notna(row["memory_growth"]):
|
||||
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
|
||||
else:
|
||||
html.append('<td>N/A</td>')
|
||||
|
||||
html.append('</tr>')
|
||||
else:
|
||||
# Using list of dicts (when pandas is not available)
|
||||
for row in rows:
|
||||
html.append('<tr>')
|
||||
html.append(f'<td>{row["test_id"]}</td>')
|
||||
html.append(f'<td>{row["date"]}</td>')
|
||||
html.append(f'<td>{row["urls"]}</td>')
|
||||
html.append(f'<td>{row["workers"]}</td>')
|
||||
html.append(f'<td>{row["success_rate"]:.1f}%</td>')
|
||||
html.append(f'<td>{row["time_seconds"]:.2f}</td>')
|
||||
html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
|
||||
|
||||
# Memory growth cell
|
||||
if row["memory_growth"] is not None:
|
||||
html.append(f'<td>{row["memory_growth"]:.1f}</td>')
|
||||
else:
|
||||
html.append('<td>N/A</td>')
|
||||
|
||||
html.append('</tr>')
|
||||
|
||||
html.append('</table>')
|
||||
|
||||
# Conclusion section
|
||||
html.append('<div class="card">')
|
||||
html.append('<h2>Conclusion</h2>')
|
||||
|
||||
if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
|
||||
# Using pandas for statistics (when available)
|
||||
# Calculate some overall statistics
|
||||
avg_urls_per_sec = df['urls_per_second'].mean()
|
||||
max_urls_per_sec = df['urls_per_second'].max()
|
||||
|
||||
# Determine if we have a trend
|
||||
if len(df) > 1:
|
||||
trend_data = df.sort_values('timestamp')
|
||||
first_perf = trend_data.iloc[0]['urls_per_second']
|
||||
last_perf = trend_data.iloc[-1]['urls_per_second']
|
||||
|
||||
perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0
|
||||
|
||||
if perf_change > 10:
|
||||
trend_desc = "significantly improved"
|
||||
trend_class = "status-good"
|
||||
elif perf_change > 5:
|
||||
trend_desc = "improved"
|
||||
trend_class = "status-good"
|
||||
elif perf_change < -10:
|
||||
trend_desc = "significantly decreased"
|
||||
trend_class = "status-bad"
|
||||
elif perf_change < -5:
|
||||
trend_desc = "decreased"
|
||||
trend_class = "status-bad"
|
||||
else:
|
||||
trend_desc = "remained stable"
|
||||
trend_class = ""
|
||||
|
||||
html.append(f'<p>Overall performance has <span class="{trend_class}">{trend_desc}</span> over the test period.</p>')
|
||||
|
||||
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
|
||||
# Memory leak assessment
|
||||
if 'memory_growth' in df.columns and not df['memory_growth'].isna().all():
|
||||
avg_growth = df['memory_growth'].mean()
|
||||
max_growth = df['memory_growth'].max()
|
||||
|
||||
if avg_growth < 5:
|
||||
leak_assessment = "No significant memory leaks detected"
|
||||
leak_class = "status-good"
|
||||
elif avg_growth < 10:
|
||||
leak_assessment = "Minor memory growth observed"
|
||||
leak_class = "status-warning"
|
||||
else:
|
||||
leak_assessment = "Potential memory leak detected"
|
||||
leak_class = "status-bad"
|
||||
|
||||
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
|
||||
else:
|
||||
# Manual calculations without pandas
|
||||
if rows:
|
||||
# Calculate average and max throughput
|
||||
total_urls_per_sec = sum(row['urls_per_second'] for row in rows)
|
||||
avg_urls_per_sec = total_urls_per_sec / len(rows)
|
||||
max_urls_per_sec = max(row['urls_per_second'] for row in rows)
|
||||
|
||||
html.append(f'<p>Average throughput: <strong>{avg_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
html.append(f'<p>Maximum throughput: <strong>{max_urls_per_sec:.1f}</strong> URLs/second</p>')
|
||||
|
||||
# Memory assessment (simplified without pandas)
|
||||
growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None]
|
||||
if growth_values:
|
||||
avg_growth = sum(growth_values) / len(growth_values)
|
||||
|
||||
if avg_growth < 5:
|
||||
leak_assessment = "No significant memory leaks detected"
|
||||
leak_class = "status-good"
|
||||
elif avg_growth < 10:
|
||||
leak_assessment = "Minor memory growth observed"
|
||||
leak_class = "status-warning"
|
||||
else:
|
||||
leak_assessment = "Potential memory leak detected"
|
||||
leak_class = "status-bad"
|
||||
|
||||
html.append(f'<p><span class="{leak_class}">{leak_assessment}</span>. Average memory growth: <strong>{avg_growth:.1f} MB</strong> per test.</p>')
|
||||
else:
|
||||
html.append('<p>No test data available for analysis.</p>')
|
||||
|
||||
html.append('</div>')
|
||||
|
||||
# Footer
|
||||
html.append('<div style="margin-top: 30px; text-align: center; color: #777; font-size: 0.9em;">')
|
||||
html.append('<p>Generated by Crawl4AI Benchmark Reporter</p>')
|
||||
html.append('</div>')
|
||||
|
||||
html.append('</body>')
|
||||
html.append('</html>')
|
||||
|
||||
# Write the HTML file
|
||||
with open(output_file, 'w') as f:
|
||||
f.write('\n'.join(html))
|
||||
|
||||
# Print a clickable link for terminals that support it (iTerm, VS Code, etc.)
|
||||
file_url = f"file://{os.path.abspath(output_file)}"
|
||||
console.print(f"[green]Comparison report saved to: {output_file}[/green]")
|
||||
console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]")
|
||||
return output_file
|
||||
|
||||
def run(self, limit=None, output_file=None):
|
||||
"""Generate a full benchmark report.
|
||||
|
||||
Args:
|
||||
limit: Optional limit on number of most recent tests to include
|
||||
output_file: Optional output file path
|
||||
|
||||
Returns:
|
||||
Path to the generated report file
|
||||
"""
|
||||
# Load test results
|
||||
results = self.load_test_results(limit=limit)
|
||||
|
||||
if not results:
|
||||
console.print("[yellow]No test results found. Run some tests first.[/yellow]")
|
||||
return None
|
||||
|
||||
# Generate and display summary table
|
||||
summary_table = self.generate_summary_table(results)
|
||||
console.print(summary_table)
|
||||
|
||||
# Generate comparison report
|
||||
title = f"Crawl4AI Benchmark Report ({len(results)} test runs)"
|
||||
report_file = self.generate_comparison_report(results, title=title, output_file=output_file)
|
||||
|
||||
if report_file:
|
||||
console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]")
|
||||
return report_file
|
||||
else:
|
||||
console.print("[bold red]Failed to generate report[/bold red]")
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """Main entry point for the benchmark reporter.

    Returns 0 on success and 1 on failure, for use as a process exit code.
    """
    parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests")
    parser.add_argument("--reports-dir", type=str, default="reports",
                        help="Directory containing test result files")
    parser.add_argument("--output-dir", type=str, default="benchmark_reports",
                        help="Directory to save generated reports")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit to most recent N test results")
    parser.add_argument("--output-file", type=str, default=None,
                        help="Custom output file path for the report")
    args = parser.parse_args()

    # Build the reporter and produce the report.
    reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir)
    report_file = reporter.run(limit=args.limit, output_file=args.output_file)

    if not report_file:
        print("Failed to generate report")
        return 1

    print(f"Report generated at: {report_file}")
    return 0
|
||||
if __name__ == "__main__":
    # sys is already imported at module level; the local re-import was redundant.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user