-
-
+
+
Monitor
+
+
+
+
diff --git a/deploy/docker/test-websocket.py b/deploy/docker/test-websocket.py
new file mode 100755
index 00000000..db121deb
--- /dev/null
+++ b/deploy/docker/test-websocket.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""
+Quick WebSocket test - Connect to monitor WebSocket and print updates
+"""
+import asyncio
+import websockets
+import json
+
+async def test_websocket():
+    """Print five monitor updates read from the WebSocket; return 0 on success, 1 on any error."""
+    uri = "ws://localhost:11235/monitor/ws"
+    print(f"Connecting to {uri}...")
+
+    try:
+        async with websockets.connect(uri) as ws:
+            print("✅ Connected!")
+
+            # Receive and print 5 updates, numbered from 1
+            for update_no in range(1, 6):
+                payload = json.loads(await ws.recv())
+                print(f"\n📊 Update #{update_no}:")
+                container = payload['health']['container']
+                print(f" - Health: CPU {container['cpu_percent']}%, Memory {container['memory_percent']}%")
+                print(f" - Active Requests: {len(payload['requests']['active'])}")
+                print(f" - Browsers: {len(payload['browsers'])}")
+    except Exception as exc:
+        print(f"❌ Error: {exc}")
+        return 1
+    else:
+        print("\n✅ WebSocket test passed!")
+        return 0
+
+if __name__ == "__main__":
+    raise SystemExit(asyncio.run(test_websocket()))  # exit() is site's interactive helper; SystemExit always works
diff --git a/deploy/docker/tests/demo_monitor_dashboard.py b/deploy/docker/tests/demo_monitor_dashboard.py
new file mode 100755
index 00000000..699988a5
--- /dev/null
+++ b/deploy/docker/tests/demo_monitor_dashboard.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""
+Monitor Dashboard Demo Script
+Generates varied activity to showcase all monitoring features for video recording.
+"""
+import httpx
+import asyncio
+import time
+from datetime import datetime
+
+BASE_URL = "http://localhost:11235"  # root URL that every demo request and dashboard link is built from
+
+async def demo_dashboard():
+    """Drive the server through seven phases of demo traffic, then print its final monitor stats."""
+    print("🎬 Monitor Dashboard Demo - Starting...\n")
+    print(f"📊 Dashboard: {BASE_URL}/dashboard")
+    print("=" * 60)
+
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        # Phase 1: Simple requests (permanent browser)
+        print("\n🔷 Phase 1: Testing permanent browser pool")
+        print("-" * 60)
+        for i in range(5):
+            print(f" {i+1}/5 Request to /crawl (default config)...")
+            try:
+                r = await client.post(
+                    f"{BASE_URL}/crawl",
+                    json={"urls": [f"https://httpbin.org/html?req={i}"], "crawler_config": {}}
+                )
+                print(f" ✅ Status: {r.status_code}, Time: {r.elapsed.total_seconds():.2f}s")
+            except Exception as e:
+                print(f" ❌ Error: {e}")
+            await asyncio.sleep(1)  # Small delay between requests
+
+        # Phase 2: Create variant browsers (different configs)
+        print("\n🔶 Phase 2: Testing cold→hot pool promotion")
+        print("-" * 60)
+        viewports = [
+            {"width": 1920, "height": 1080},
+            {"width": 1280, "height": 720},
+            {"width": 800, "height": 600}
+        ]
+
+        for idx, viewport in enumerate(viewports):
+            print(f" Viewport {viewport['width']}x{viewport['height']}:")
+            for i in range(4):  # 4 requests each to trigger promotion at 3
+                try:
+                    r = await client.post(
+                        f"{BASE_URL}/crawl",
+                        json={
+                            "urls": [f"https://httpbin.org/json?v={idx}&r={i}"],
+                            "browser_config": {"viewport": viewport},
+                            "crawler_config": {}
+                        }
+                    )
+                    print(f" {i+1}/4 ✅ {r.status_code} - Should see cold→hot after 3 uses")
+                except Exception as e:
+                    print(f" {i+1}/4 ❌ {e}")
+                await asyncio.sleep(0.5)
+
+        # Phase 3: Concurrent burst (stress pool)
+        print("\n🔷 Phase 3: Concurrent burst (10 parallel)")
+        print("-" * 60)
+        # Coroutines are only created here; they start running inside gather()
+        tasks = [
+            client.post(
+                f"{BASE_URL}/crawl",
+                json={"urls": [f"https://httpbin.org/delay/2?burst={i}"], "crawler_config": {}}
+            )
+            for i in range(10)
+        ]
+
+        print(" Sending 10 concurrent requests...")
+        start = time.perf_counter()  # monotonic clock: unaffected by wall-clock adjustments
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        elapsed = time.perf_counter() - start
+
+        successes = sum(1 for r in results if not isinstance(r, Exception) and r.status_code == 200)
+        print(f" ✅ {successes}/10 succeeded in {elapsed:.2f}s")
+
+        # Phase 4: Multi-endpoint coverage
+        print("\n🔶 Phase 4: Testing multiple endpoints")
+        print("-" * 60)
+        endpoints = [
+            ("/md", {"url": "https://httpbin.org/html", "f": "fit", "c": "0"}),
+            ("/screenshot", {"url": "https://httpbin.org/html"}),
+            ("/pdf", {"url": "https://httpbin.org/html"}),
+        ]
+
+        for endpoint, payload in endpoints:
+            print(f" Testing {endpoint}...")
+            try:
+                r = await client.post(f"{BASE_URL}{endpoint}", json=payload)  # same call for every endpoint
+                print(f" ✅ {r.status_code}")
+            except Exception as e:
+                print(f" ❌ {e}")
+            await asyncio.sleep(1)
+
+        # Phase 5: Intentional error (to populate errors tab)
+        print("\n🔷 Phase 5: Generating error examples")
+        print("-" * 60)
+        print(" Triggering invalid URL error...")
+        try:
+            r = await client.post(
+                f"{BASE_URL}/crawl",
+                json={"urls": ["invalid://bad-url"], "crawler_config": {}}
+            )
+            print(f" Response: {r.status_code}")
+        except Exception as e:
+            print(f" ✅ Error captured: {type(e).__name__}")
+
+        # Phase 6: Wait for janitor activity
+        print("\n🔶 Phase 6: Waiting for janitor cleanup...")
+        print("-" * 60)
+        print(" Idle for 40s to allow janitor to clean cold pool browsers...")
+        for i in range(40, 0, -10):
+            print(f" {i}s remaining... (Check dashboard for cleanup events)")
+            await asyncio.sleep(10)
+
+        # Phase 7: Final stats check
+        print("\n🔷 Phase 7: Final dashboard state")
+        print("-" * 60)
+
+        r = await client.get(f"{BASE_URL}/monitor/health")
+        r.raise_for_status()  # report the HTTP failure itself rather than a KeyError on an error body
+        health = r.json()
+        print(f" Memory: {health['container']['memory_percent']:.1f}%")
+        print(f" Browsers: Perm={health['pool']['permanent']['active']}, "
+              f"Hot={health['pool']['hot']['count']}, Cold={health['pool']['cold']['count']}")
+
+        r = await client.get(f"{BASE_URL}/monitor/endpoints/stats")
+        r.raise_for_status()
+        stats = r.json()
+        print("\n Endpoint Stats:")
+        for endpoint, data in stats.items():
+            print(f" {endpoint}: {data['count']} req, "
+                  f"{data['avg_latency_ms']:.0f}ms avg, "
+                  f"{data['success_rate_percent']:.1f}% success")
+
+        r = await client.get(f"{BASE_URL}/monitor/browsers")
+        r.raise_for_status()
+        browsers = r.json()
+        print("\n Pool Efficiency:")
+        print(f" Total browsers: {browsers['summary']['total_count']}")
+        print(f" Memory usage: {browsers['summary']['total_memory_mb']} MB")
+        print(f" Reuse rate: {browsers['summary']['reuse_rate_percent']:.1f}%")
+
+    print("\n" + "=" * 60)
+    print("✅ Demo complete! Dashboard is now populated with rich data.")
+    print(f"\n📹 Recording tip: Refresh {BASE_URL}/dashboard")
+    print(" You should see:")
+    print(" • Active & completed requests")
+    print(" • Browser pool (permanent + hot/cold)")
+    print(" • Janitor cleanup events")
+    print(" • Endpoint analytics")
+    print(" • Memory timeline")
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(demo_dashboard())
+    except Exception as exc:  # KeyboardInterrupt is not an Exception subclass, so clause order is irrelevant
+        print(f"\n\n❌ Demo failed: {exc}")
+    except KeyboardInterrupt:
+        print("\n\n⚠️ Demo interrupted by user")
diff --git a/deploy/docker/tests/requirements.txt b/deploy/docker/tests/requirements.txt
new file mode 100644
index 00000000..5f7a842f
--- /dev/null
+++ b/deploy/docker/tests/requirements.txt
@@ -0,0 +1,2 @@
+httpx>=0.25.0
+docker>=7.0.0
diff --git a/deploy/docker/tests/test_1_basic.py b/deploy/docker/tests/test_1_basic.py
new file mode 100755
index 00000000..c86de073
--- /dev/null
+++ b/deploy/docker/tests/test_1_basic.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Test 1: Basic Container Health + Single Endpoint
+- Starts container
+- Hits /health endpoint 10 times
+- Reports success rate and basic latency
+"""
+import asyncio
+import time
+import docker
+import httpx
+
+# Config
+IMAGE = "crawl4ai-local:latest"  # image handed to containers.run()
+CONTAINER_NAME = "crawl4ai-test"  # an existing container with this name is stopped and removed first
+PORT = 11235  # published on the same host port; also used to build the /health URL
+REQUESTS = 10  # number of sequential GETs sent to /health
+
+async def test_endpoint(url: str, count: int):
+    """GET `url` `count` times in sequence; return one dict per request (success, latency_ms, status or error)."""
+    results = []
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        for i in range(count):
+            start = time.perf_counter()  # monotonic: immune to wall-clock adjustments, unlike time.time()
+            try:
+                resp = await client.get(url)
+                elapsed = (time.perf_counter() - start) * 1000  # ms
+                results.append({
+                    "success": resp.status_code == 200,
+                    "latency_ms": elapsed,
+                    "status": resp.status_code
+                })
+                print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
+            except Exception as e:
+                results.append({
+                    "success": False,
+                    "latency_ms": None,
+                    "error": str(e)
+                })
+                print(f" [{i+1}/{count}] ✗ Error: {e}")
+    return results
+
+def start_container(client, image: str, name: str, port: int):
+    """Run `image` as container `name`; return it once /health answers 200, else remove it and raise TimeoutError."""
+    # Clean up existing
+    try:
+        old = client.containers.get(name)
+        print(f"🧹 Stopping existing container '{name}'...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print(f"🚀 Starting container '{name}' from image '{image}'...")
+    container = client.containers.run(
+        image,
+        name=name,
+        ports={f"{port}/tcp": port},
+        detach=True,
+        shm_size="1g",
+        environment={"PYTHON_ENV": "production"}
+    )
+
+    # Wait for health
+    print("⏳ Waiting for container to be healthy...")
+    for _ in range(30):  # 30s timeout
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                # Probe with httpx (already imported by this file); `requests` is not in requirements.txt
+                resp = httpx.get(f"http://localhost:{port}/health", timeout=2)
+                if resp.status_code == 200:
+                    print("✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # server not accepting connections yet; retry on the next tick
+    container.remove(force=True)  # don't leak the container (and its published port) on startup timeout
+    raise TimeoutError("Container failed to start")
+
+def stop_container(container):
+    """Stop the given container, delete it, and report both steps on stdout."""
+    print("🛑 Stopping container...")
+    container.stop()
+    container.remove()
+    print("✅ Container removed")
+
+async def main():
+    """Start the container, probe /health REQUESTS times, print stats; return 0 on pass, 1 otherwise."""
+    banner = "=" * 60
+    print(banner)
+    print("TEST 1: Basic Container Health + Single Endpoint")
+    print(banner)
+
+    docker_client = docker.from_env()
+    container = None
+    try:
+        # Bring the container up (returns once /health responds)
+        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)
+
+        # Exercise /health
+        print(f"\n📊 Testing /health endpoint ({REQUESTS} requests)...")
+        outcomes = await test_endpoint(f"http://localhost:{PORT}/health", REQUESTS)
+
+        # Aggregate
+        successes = sum(1 for item in outcomes if item["success"])
+        total = len(outcomes)
+        success_rate = (successes / total) * 100
+        latencies = [item["latency_ms"] for item in outcomes if item["latency_ms"] is not None]
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0
+
+        # Report
+        print(f"\n{banner}")
+        print("RESULTS:")
+        print(f" Success Rate: {success_rate:.1f}% ({successes}/{total})")
+        print(f" Avg Latency: {avg_latency:.0f}ms")
+        if latencies:
+            print(f" Min Latency: {min(latencies):.0f}ms")
+            print(f" Max Latency: {max(latencies):.0f}ms")
+        print(banner)
+
+        # Verdict
+        if success_rate < 100:
+            print("❌ TEST FAILED (expected 100% success rate)")
+            return 1
+        print("✅ TEST PASSED")
+        return 0
+
+    except Exception as exc:
+        print(f"\n❌ TEST ERROR: {exc}")
+        return 1
+    finally:
+        if container:
+            stop_container(container)
+
+if __name__ == "__main__":
+    exit_code = asyncio.run(main())
+    raise SystemExit(exit_code)  # works even when site's interactive exit() helper is unavailable
diff --git a/deploy/docker/tests/test_2_memory.py b/deploy/docker/tests/test_2_memory.py
new file mode 100755
index 00000000..aed4c61c
--- /dev/null
+++ b/deploy/docker/tests/test_2_memory.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""
+Test 2: Docker Stats Monitoring
+- Extends Test 1 with real-time container stats
+- Monitors memory % and CPU during requests
+- Reports baseline, peak, and final memory
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+
+# Config
+IMAGE = "crawl4ai-local:latest"  # image handed to containers.run()
+CONTAINER_NAME = "crawl4ai-test"  # an existing container with this name is stopped and removed first
+PORT = 11235  # published on the same host port; also used to build the /health URL
+REQUESTS = 20  # More requests to see memory usage
+
+# Stats tracking
+stats_history = []  # samples appended by the monitor_stats thread and read back by main()
+stop_monitoring = Event()  # set by main(); monitor_stats leaves its loop at the next sample
+
+def monitor_stats(container):
+    """Append memory/CPU samples from the container's stats stream to stats_history until stop_monitoring is set."""
+    bytes_per_mb = 1024 * 1024
+    for sample in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+
+        try:
+            # Memory: usage and limit converted to MB
+            memory = sample['memory_stats']
+            used_mb = memory.get('usage', 0) / bytes_per_mb
+            limit_mb = memory.get('limit', 1) / bytes_per_mb
+            used_pct = (used_mb / limit_mb * 100) if limit_mb > 0 else 0
+
+            # CPU: stays 0 when fields are missing (e.g. on Mac)
+            cpu_pct = 0
+            try:
+                cur, prev = sample['cpu_stats'], sample['precpu_stats']
+                cpu_delta = cur['cpu_usage']['total_usage'] - prev['cpu_usage']['total_usage']
+                system_delta = cur.get('system_cpu_usage', 0) - prev.get('system_cpu_usage', 0)
+                if system_delta > 0:
+                    cpu_pct = cpu_delta / system_delta * cur.get('online_cpus', 1) * 100.0
+            except (KeyError, ZeroDivisionError):
+                pass
+
+            stats_history.append({
+                'timestamp': time.time(),
+                'memory_mb': used_mb,
+                'memory_percent': used_pct,
+                'cpu_percent': cpu_pct
+            })
+        except Exception:
+            # Skip malformed stats
+            pass
+
+        time.sleep(0.5)  # pause before pulling the next sample from the stream
+
+async def test_endpoint(url: str, count: int):
+    """GET `url` `count` times; return per-request dicts (latency_ms is present only when a response arrived)."""
+    results = []
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        for i in range(count):
+            start = time.perf_counter()  # monotonic: immune to wall-clock adjustments, unlike time.time()
+            try:
+                resp = await client.get(url)
+                elapsed = (time.perf_counter() - start) * 1000
+                results.append({
+                    "success": resp.status_code == 200,
+                    "latency_ms": elapsed,
+                })
+                if (i + 1) % 5 == 0:  # Print every 5 requests
+                    print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
+            except Exception as e:
+                results.append({"success": False, "error": str(e)})
+                print(f" [{i+1}/{count}] ✗ Error: {e}")
+    return results
+
+def start_container(client, image: str, name: str, port: int):
+    """Run `image` as container `name`; return it once /health answers 200, else remove it and raise TimeoutError."""
+    try:
+        old = client.containers.get(name)
+        print(f"🧹 Stopping existing container '{name}'...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print(f"🚀 Starting container '{name}'...")
+    container = client.containers.run(
+        image,
+        name=name,
+        ports={f"{port}/tcp": port},
+        detach=True,
+        shm_size="1g",
+        mem_limit="4g",  # Set explicit memory limit
+    )
+
+    print("⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                resp = httpx.get(f"http://localhost:{port}/health", timeout=2)  # httpx: no undeclared `requests` dep
+                if resp.status_code == 200:
+                    print("✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # server not accepting connections yet; retry on the next tick
+    container.remove(force=True)  # don't leak the container (and its published port) on startup timeout
+    raise TimeoutError("Container failed to start")
+
+def stop_container(container):
+    """Announce the shutdown, then stop and delete the given container."""
+    print("🛑 Stopping container...")
+    container.stop()
+    container.remove()
+
+async def main():
+    """Load /health while sampling docker stats; return 0 only if every request passes and memory grows < 100 MB."""
+    print("=" * 60)
+    print("TEST 2: Docker Stats Monitoring")
+    print("=" * 60)
+
+    docker_client = docker.from_env()
+    container = None
+    sampler = None
+
+    try:
+        # Bring the container up
+        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)
+
+        # Reset shared state, then sample stats on a daemon thread
+        print("\n📊 Starting stats monitor...")
+        stop_monitoring.clear()
+        stats_history.clear()
+        sampler = Thread(target=monitor_stats, args=(container,), daemon=True)
+        sampler.start()
+
+        # Give the sampler time to record a baseline (0 if no sample arrived)
+        await asyncio.sleep(2)
+        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
+        print(f"📏 Baseline memory: {baseline_mem:.1f} MB")
+
+        # Load /health
+        print(f"\n🔄 Running {REQUESTS} requests to /health...")
+        outcomes = await test_endpoint(f"http://localhost:{PORT}/health", REQUESTS)
+
+        # Leave time for one more sample so the peak is captured
+        await asyncio.sleep(1)
+
+        # Stop the sampler
+        stop_monitoring.set()
+        if sampler is not None:
+            sampler.join(timeout=2)
+
+        # Request stats
+        successes = sum(1 for item in outcomes if item.get("success"))
+        success_rate = (successes / len(outcomes)) * 100
+        latencies = [item["latency_ms"] for item in outcomes if "latency_ms" in item]
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0
+
+        # Memory stats
+        memory_samples = [sample['memory_mb'] for sample in stats_history]
+        peak_mem = max(memory_samples) if memory_samples else 0
+        final_mem = memory_samples[-1] if memory_samples else 0
+        mem_delta = final_mem - baseline_mem
+
+        # Report
+        print(f"\n{'=' * 60}")
+        print("RESULTS:")
+        print(f" Success Rate: {success_rate:.1f}% ({successes}/{len(outcomes)})")
+        print(f" Avg Latency: {avg_latency:.0f}ms")
+        print("\n Memory Stats:")
+        print(f" Baseline: {baseline_mem:.1f} MB")
+        print(f" Peak: {peak_mem:.1f} MB")
+        print(f" Final: {final_mem:.1f} MB")
+        print(f" Delta: {mem_delta:+.1f} MB")
+        print("=" * 60)
+
+        # Verdict
+        passed = success_rate >= 100 and mem_delta < 100  # No significant memory growth
+        if passed:
+            print("✅ TEST PASSED")
+            return 0
+        if success_rate < 100:
+            print("❌ TEST FAILED (success rate < 100%)")
+        if mem_delta >= 100:
+            print(f"⚠️ WARNING: Memory grew by {mem_delta:.1f} MB")
+        return 1
+
+    except Exception as exc:
+        print(f"\n❌ TEST ERROR: {exc}")
+        return 1
+    finally:
+        stop_monitoring.set()
+        if container:
+            stop_container(container)
+
+if __name__ == "__main__":
+    exit_code = asyncio.run(main())
+    raise SystemExit(exit_code)  # works even when site's interactive exit() helper is unavailable
diff --git a/deploy/docker/tests/test_3_pool.py b/deploy/docker/tests/test_3_pool.py
new file mode 100755
index 00000000..9f2c00b2
--- /dev/null
+++ b/deploy/docker/tests/test_3_pool.py
@@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""
+Test 3: Pool Validation - Permanent Browser Reuse
+- Tests /html endpoint (should use permanent browser)
+- Monitors container logs for pool hit markers
+- Validates browser reuse rate
+- Checks memory after browser creation
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+
+# Config
+IMAGE = "crawl4ai-local:latest"  # image handed to containers.run()
+CONTAINER_NAME = "crawl4ai-test"  # an existing container with this name is stopped and removed first
+PORT = 11235  # published on the same host port; also used to build the /health and /html URLs
+REQUESTS = 30  # number of sequential POSTs sent to /html
+
+# Stats tracking
+stats_history = []  # samples appended by the monitor_stats thread and read back by main()
+stop_monitoring = Event()  # set by main(); monitor_stats leaves its loop at the next sample
+
+def monitor_stats(container):
+    """Background stats collector: append {'timestamp', 'memory_mb'} samples to stats_history until stopped."""
+    for stat in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+        try:
+            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
+            stats_history.append({
+                'timestamp': time.time(),
+                'memory_mb': mem_usage,
+            })
+        except Exception:  # skip malformed samples, but let SystemExit/KeyboardInterrupt propagate
+            pass
+        time.sleep(0.5)
+
+def count_log_markers(container):
+    """Count pool-tier log markers in the container's full log output."""
+    log_text = container.logs().decode('utf-8')
+    markers = {
+        'permanent_hits': "🔥 Using permanent browser",
+        'hot_hits': "♨️ Using hot pool browser",
+        'cold_hits': "❄️ Using cold pool browser",
+        'new_created': "🆕 Creating new browser",
+    }
+
+    counts = {key: log_text.count(needle) for key, needle in markers.items()}
+    # total_hits covers the three reuse tiers only; newly created browsers are excluded
+    counts['total_hits'] = (
+        counts['permanent_hits'] + counts['hot_hits'] + counts['cold_hits']
+    )
+    return counts
+
+async def test_endpoint(url: str, count: int):
+    """POST a fixed httpbin URL to `url` `count` times; return per-request dicts (latency_ms only on a response)."""
+    results = []
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        for i in range(count):
+            start = time.perf_counter()  # monotonic: immune to wall-clock adjustments, unlike time.time()
+            try:
+                resp = await client.post(url, json={"url": "https://httpbin.org/html"})
+                elapsed = (time.perf_counter() - start) * 1000
+                results.append({
+                    "success": resp.status_code == 200,
+                    "latency_ms": elapsed,
+                })
+                if (i + 1) % 10 == 0:
+                    print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
+            except Exception as e:
+                results.append({"success": False, "error": str(e)})
+                print(f" [{i+1}/{count}] ✗ Error: {e}")
+    return results
+
+def start_container(client, image: str, name: str, port: int):
+    """Run `image` as container `name`; return it once /health answers 200, else remove it and raise TimeoutError."""
+    try:
+        old = client.containers.get(name)
+        print("🧹 Stopping existing container...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print("🚀 Starting container...")
+    container = client.containers.run(
+        image,
+        name=name,
+        ports={f"{port}/tcp": port},
+        detach=True,
+        shm_size="1g",
+        mem_limit="4g",
+    )
+
+    print("⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                resp = httpx.get(f"http://localhost:{port}/health", timeout=2)  # httpx: no undeclared `requests` dep
+                if resp.status_code == 200:
+                    print("✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # server not accepting connections yet; retry on the next tick
+    container.remove(force=True)  # don't leak the container (and its published port) on startup timeout
+    raise TimeoutError("Container failed to start")
+
+def stop_container(container):
+    """Announce the shutdown, then stop and delete the given container."""
+    print("🛑 Stopping container...")
+    container.stop()
+    container.remove()
+
+async def main():
+    """Load /html, then judge success rate and pool reuse from the container logs; return 0 on pass, else 1."""
+    print("=" * 60)
+    print("TEST 3: Pool Validation - Permanent Browser Reuse")
+    print("=" * 60)
+
+    docker_client = docker.from_env()
+    container = None
+    sampler = None
+
+    try:
+        # Bring the container up
+        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)
+
+        # Fixed pause for the permanent browser to initialise
+        print("\n⏳ Waiting for permanent browser init (3s)...")
+        await asyncio.sleep(3)
+
+        # Reset shared state and start sampling on a daemon thread
+        print("📊 Starting stats monitor...")
+        stop_monitoring.clear()
+        stats_history.clear()
+        sampler = Thread(target=monitor_stats, args=(container,), daemon=True)
+        sampler.start()
+
+        await asyncio.sleep(1)
+        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
+        print(f"📏 Baseline (with permanent browser): {baseline_mem:.1f} MB")
+
+        # Load /html (uses permanent browser for default config)
+        print(f"\n🔄 Running {REQUESTS} requests to /html...")
+        target = f"http://localhost:{PORT}/html"
+        outcomes = await test_endpoint(target, REQUESTS)
+
+        # Leave time for a final sample
+        await asyncio.sleep(1)
+
+        # Stop the sampler
+        stop_monitoring.set()
+        if sampler is not None:
+            sampler.join(timeout=2)
+
+        # Pool markers from the container logs
+        print("\n📋 Analyzing pool usage...")
+        pool_stats = count_log_markers(container)
+
+        # Request stats
+        successes = sum(1 for item in outcomes if item.get("success"))
+        success_rate = (successes / len(outcomes)) * 100
+        latencies = [item["latency_ms"] for item in outcomes if "latency_ms" in item]
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0
+
+        # Memory stats
+        memory_samples = [sample['memory_mb'] for sample in stats_history]
+        peak_mem = max(memory_samples) if memory_samples else 0
+        final_mem = memory_samples[-1] if memory_samples else 0
+        mem_delta = final_mem - baseline_mem
+
+        # Reuse rate: pool hits per request, as a percentage
+        total_requests = len(outcomes)
+        total_pool_hits = pool_stats['total_hits']
+        reuse_rate = (total_pool_hits / total_requests * 100) if total_requests > 0 else 0
+
+        # Report
+        print(f"\n{'=' * 60}")
+        print("RESULTS:")
+        print(f" Success Rate: {success_rate:.1f}% ({successes}/{len(outcomes)})")
+        print(f" Avg Latency: {avg_latency:.0f}ms")
+        print("\n Pool Stats:")
+        print(f" 🔥 Permanent Hits: {pool_stats['permanent_hits']}")
+        print(f" ♨️ Hot Pool Hits: {pool_stats['hot_hits']}")
+        print(f" ❄️ Cold Pool Hits: {pool_stats['cold_hits']}")
+        print(f" 🆕 New Created: {pool_stats['new_created']}")
+        print(f" 📊 Reuse Rate: {reuse_rate:.1f}%")
+        print("\n Memory Stats:")
+        print(f" Baseline: {baseline_mem:.1f} MB")
+        print(f" Peak: {peak_mem:.1f} MB")
+        print(f" Final: {final_mem:.1f} MB")
+        print(f" Delta: {mem_delta:+.1f} MB")
+        print("=" * 60)
+
+        # Verdict: the first two checks fail the test, the last two only warn
+        passed = True
+        if success_rate < 100:
+            print(f"❌ FAIL: Success rate {success_rate:.1f}% < 100%")
+            passed = False
+        if reuse_rate < 80:
+            print(f"❌ FAIL: Reuse rate {reuse_rate:.1f}% < 80% (expected high permanent browser usage)")
+            passed = False
+        if pool_stats['permanent_hits'] < (total_requests * 0.8):
+            print(f"⚠️ WARNING: Only {pool_stats['permanent_hits']} permanent hits out of {total_requests} requests")
+        if mem_delta > 200:
+            print(f"⚠️ WARNING: Memory grew by {mem_delta:.1f} MB (possible browser leak)")
+
+        if not passed:
+            return 1
+        print("✅ TEST PASSED")
+        return 0
+
+    except Exception as exc:
+        print(f"\n❌ TEST ERROR: {exc}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    finally:
+        stop_monitoring.set()
+        if container:
+            stop_container(container)
+
+if __name__ == "__main__":
+    exit_code = asyncio.run(main())
+    raise SystemExit(exit_code)  # works even when site's interactive exit() helper is unavailable
diff --git a/deploy/docker/tests/test_4_concurrent.py b/deploy/docker/tests/test_4_concurrent.py
new file mode 100755
index 00000000..70198ddc
--- /dev/null
+++ b/deploy/docker/tests/test_4_concurrent.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+"""
+Test 4: Concurrent Load Testing
+- Tests pool under concurrent load
+- Escalates: 10 → 50 → 100 concurrent requests
+- Validates latency distribution (P50, P95, P99)
+- Monitors memory stability
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+from collections import defaultdict
+
+# Config
+IMAGE = "crawl4ai-local:latest"  # image handed to containers.run()
+CONTAINER_NAME = "crawl4ai-test"  # an existing container with this name is stopped and removed first
+PORT = 11235  # published on the same host port; also used to build the /health and /html URLs
+LOAD_LEVELS = [  # run in order; "concurrent" caps in-flight requests, "requests" is the total sent
+    {"name": "Light", "concurrent": 10, "requests": 20},
+    {"name": "Medium", "concurrent": 50, "requests": 100},
+    {"name": "Heavy", "concurrent": 100, "requests": 200},
+]
+
+# Stats
+stats_history = []  # samples appended by the monitor_stats thread and read back by main()
+stop_monitoring = Event()  # set by main(); monitor_stats leaves its loop at the next sample
+
+def monitor_stats(container):
+    """Background stats collector: append {'timestamp', 'memory_mb'} samples to stats_history until stopped."""
+    for stat in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+        try:
+            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
+            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
+        except Exception:  # skip malformed samples, but let SystemExit/KeyboardInterrupt propagate
+            pass
+        time.sleep(0.5)
+
+def count_log_markers(container):
+    """Return how often each pool-tier marker occurs in the container logs."""
+    text = container.logs().decode('utf-8')
+    needles = (
+        ('permanent', "🔥 Using permanent browser"), ('hot', "♨️ Using hot pool browser"),
+        ('cold', "❄️ Using cold pool browser"), ('new', "🆕 Creating new browser"),
+    )
+    # the dict keeps the permanent/hot/cold/new key order
+    return {tier: text.count(marker) for tier, marker in needles}
+
+async def hit_endpoint(client, url, payload, semaphore):
+    """POST `payload` to `url` once the semaphore admits it; return {'success', 'latency_ms'} or {'success', 'error'}."""
+    async with semaphore:
+        start = time.perf_counter()  # monotonic; taken after acquiring the semaphore, so queueing is excluded
+        try:
+            resp = await client.post(url, json=payload, timeout=60.0)
+            elapsed = (time.perf_counter() - start) * 1000
+            return {"success": resp.status_code == 200, "latency_ms": elapsed}
+        except Exception as e:
+            return {"success": False, "error": str(e)}
+
+async def run_concurrent_test(url, payload, concurrent, total_requests):
+    """Issue total_requests POSTs with at most `concurrent` in flight; return the list of per-request results."""
+    gate = asyncio.Semaphore(concurrent)
+    async with httpx.AsyncClient() as http:
+        pending = (hit_endpoint(http, url, payload, gate) for _ in range(total_requests))
+        # gather() returns results in submission order
+        return await asyncio.gather(*pending)
+
+def calculate_percentiles(latencies):
+    """Return (P50, P95, P99) of `latencies` by the nearest-rank method; (0, 0, 0) when empty."""
+    if not latencies:
+        return 0, 0, 0
+    sorted_lat = sorted(latencies)
+    n = len(sorted_lat)
+    # Nearest rank is ceil(p * n), 1-based. Using int(n * p) as a 0-based index lands one
+    # sample too high (P99 of 100 samples returned the maximum). -(-x // 100) is an integer
+    # ceiling, so no float rounding is involved.
+    ranks = (-(-n * pct // 100) for pct in (50, 95, 99))
+    return tuple(sorted_lat[rank - 1] for rank in ranks)
+
+def start_container(client, image, name, port):
+    """Run `image` as container `name`; return it once /health answers 200, else remove it and raise TimeoutError."""
+    try:
+        old = client.containers.get(name)
+        print("🧹 Stopping existing container...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print("🚀 Starting container...")
+    container = client.containers.run(
+        image, name=name, ports={f"{port}/tcp": port},
+        detach=True, shm_size="1g", mem_limit="4g",
+    )
+
+    print("⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                if httpx.get(f"http://localhost:{port}/health", timeout=2).status_code == 200:  # no `requests` dep
+                    print("✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # server not accepting connections yet; retry on the next tick
+    container.remove(force=True)  # don't leak the container (and its published port) on startup timeout
+    raise TimeoutError("Container failed to start")
+
+async def main():
+    """Run every LOAD_LEVELS stage against /html and report latency, pool and memory stats; return 0 on pass."""
+    print("=" * 60)
+    print("TEST 4: Concurrent Load Testing")
+    print("=" * 60)
+
+    docker_client = docker.from_env()
+    container = None
+    sampler = None
+
+    try:
+        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)
+
+        print("\n⏳ Waiting for permanent browser init (3s)...")
+        await asyncio.sleep(3)
+
+        # Reset shared state and start sampling on a daemon thread
+        stop_monitoring.clear()
+        stats_history.clear()
+        sampler = Thread(target=monitor_stats, args=(container,), daemon=True)
+        sampler.start()
+
+        await asyncio.sleep(1)
+        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
+        print(f"📏 Baseline: {baseline_mem:.1f} MB\n")
+
+        target = f"http://localhost:{PORT}/html"
+        body = {"url": "https://httpbin.org/html"}
+
+        all_results = []
+        level_stats = []
+
+        # One pass per configured load level
+        for level in LOAD_LEVELS:
+            print("=" * 60)
+            print(f"🔄 {level['name']} Load: {level['concurrent']} concurrent, {level['requests']} total")
+            print("=" * 60)
+
+            started = time.time()
+            outcomes = await run_concurrent_test(target, body, level['concurrent'], level['requests'])
+            duration = time.time() - started
+
+            successes = sum(1 for item in outcomes if item.get("success"))
+            success_rate = (successes / len(outcomes)) * 100
+            latencies = [item["latency_ms"] for item in outcomes if "latency_ms" in item]
+            p50, p95, p99 = calculate_percentiles(latencies)
+            avg_lat = sum(latencies) / len(latencies) if latencies else 0
+
+            print(f" Duration: {duration:.1f}s")
+            print(f" Success: {success_rate:.1f}% ({successes}/{len(outcomes)})")
+            print(f" Avg Latency: {avg_lat:.0f}ms")
+            print(f" P50/P95/P99: {p50:.0f}ms / {p95:.0f}ms / {p99:.0f}ms")
+
+            level_stats.append({
+                'name': level['name'],
+                'concurrent': level['concurrent'],
+                'success_rate': success_rate,
+                'avg_latency': avg_lat,
+                'p50': p50, 'p95': p95, 'p99': p99,
+            })
+            all_results.extend(outcomes)
+
+            await asyncio.sleep(2)  # Cool down between levels
+
+        # Stop the sampler after one last second of samples
+        await asyncio.sleep(1)
+        stop_monitoring.set()
+        if sampler is not None:
+            sampler.join(timeout=2)
+
+        # Final stats
+        pool_stats = count_log_markers(container)
+        memory_samples = [sample['memory_mb'] for sample in stats_history]
+        peak_mem = max(memory_samples) if memory_samples else 0
+        final_mem = memory_samples[-1] if memory_samples else 0
+
+        print(f"\n{'=' * 60}")
+        print("FINAL RESULTS:")
+        print("=" * 60)
+        print(f" Total Requests: {len(all_results)}")
+        print("\n Pool Utilization:")
+        print(f" 🔥 Permanent: {pool_stats['permanent']}")
+        print(f" ♨️ Hot: {pool_stats['hot']}")
+        print(f" ❄️ Cold: {pool_stats['cold']}")
+        print(f" 🆕 New: {pool_stats['new']}")
+        print("\n Memory:")
+        print(f" Baseline: {baseline_mem:.1f} MB")
+        print(f" Peak: {peak_mem:.1f} MB")
+        print(f" Final: {final_mem:.1f} MB")
+        print(f" Delta: {final_mem - baseline_mem:+.1f} MB")
+        print("=" * 60)
+
+        # Verdict: only a per-level success rate below 99% fails; the other checks warn
+        passed = True
+        for summary in level_stats:
+            if summary['success_rate'] < 99:
+                print(f"❌ FAIL: {summary['name']} success rate {summary['success_rate']:.1f}% < 99%")
+                passed = False
+            if summary['p99'] > 10000:  # 10s threshold
+                print(f"⚠️ WARNING: {summary['name']} P99 latency {summary['p99']:.0f}ms very high")
+
+        if final_mem - baseline_mem > 300:
+            print(f"⚠️ WARNING: Memory grew {final_mem - baseline_mem:.1f} MB")
+
+        if not passed:
+            return 1
+        print("✅ TEST PASSED")
+        return 0
+
+    except Exception as exc:
+        print(f"\n❌ TEST ERROR: {exc}")
+        import traceback
+        traceback.print_exc()
+        return 1
+    finally:
+        stop_monitoring.set()
+        if container:
+            print("🛑 Stopping container...")
+            container.stop()
+            container.remove()
+
+if __name__ == "__main__":
+    exit_code = asyncio.run(main())
+    raise SystemExit(exit_code)  # works even when site's interactive exit() helper is unavailable
diff --git a/deploy/docker/tests/test_5_pool_stress.py b/deploy/docker/tests/test_5_pool_stress.py
new file mode 100755
index 00000000..40752d84
--- /dev/null
+++ b/deploy/docker/tests/test_5_pool_stress.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+"""
+Test 5: Pool Stress - Mixed Configs
+- Tests hot/cold pool with different browser configs
+- Uses different viewports to create config variants
+- Validates cold → hot promotion after 3 uses
+- Monitors pool tier distribution
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+import random
+
+# Config
+IMAGE = "crawl4ai-local:latest"
+CONTAINER_NAME = "crawl4ai-test"
+PORT = 11235
+REQUESTS_PER_CONFIG = 5 # 5 requests per config variant
+
+# Different viewport configs to test pool tiers
+VIEWPORT_CONFIGS = [
+ None, # Default (permanent browser)
+ {"width": 1920, "height": 1080}, # Desktop
+ {"width": 1024, "height": 768}, # Tablet
+ {"width": 375, "height": 667}, # Mobile
+]
+
+# Stats
+stats_history = []
+stop_monitoring = Event()
+
+def monitor_stats(container):
+    """Background stats collector: record one memory sample (bytes / 1024², "MB") per stats frame."""
+    for stat in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+        try:
+            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
+            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
+        except (KeyError, TypeError, AttributeError):
+            pass  # malformed or partial stats frame: skip this sample, keep collecting
+        time.sleep(0.5)
+
+def analyze_pool_logs(container):
+    """Tally pool-tier usage markers found in the container's log output."""
+    log_text = container.logs().decode('utf-8')
+
+    # Result key -> exact log fragment counted for it
+    markers = {
+        'permanent': "🔥 Using permanent browser",
+        'hot': "♨️ Using hot pool browser",
+        'cold': "❄️ Using cold pool browser",
+        'new': "🆕 Creating new browser",
+        'promotions': "⬆️ Promoting to hot pool",
+    }
+    tally = {key: log_text.count(fragment) for key, fragment in markers.items()}
+
+    # 'total' sums the three reuse tiers only; 'new' and 'promotions' are excluded
+    tally['total'] = tally['permanent'] + tally['hot'] + tally['cold']
+
+    return tally
+
+async def crawl_with_viewport(client, url, viewport):
+    """POST a single crawl request to ``url``, optionally with a custom viewport.
+
+    Returns a dict with ``success`` and ``viewport``, plus ``latency_ms`` when a
+    response came back or ``error`` when the request raised.
+    """
+    browser_config = {}
+    if viewport:
+        browser_config = {
+            "type": "BrowserConfig",
+            "params": {
+                "viewport": {"type": "dict", "value": viewport},
+                "headless": True,
+                "text_mode": True,
+                "extra_args": [
+                    "--no-sandbox", "--disable-dev-shm-usage",
+                    "--disable-gpu", "--disable-software-rasterizer",
+                    "--disable-web-security", "--allow-insecure-localhost",
+                    "--ignore-certificate-errors",
+                ],
+            },
+        }
+
+    payload = {
+        "urls": ["https://httpbin.org/html"],
+        "browser_config": browser_config,
+        "crawler_config": {},
+    }
+    began = time.time()
+    try:
+        response = await client.post(url, json=payload, timeout=60.0)
+        latency = (time.time() - began) * 1000
+        return {"success": response.status_code == 200, "latency_ms": latency, "viewport": viewport}
+    except Exception as exc:
+        return {"success": False, "error": str(exc), "viewport": viewport}
+
+def start_container(client, image, name, port):
+    """Start a fresh container and poll (up to 30 times) until GET /health returns 200."""
+    try:
+        old = client.containers.get(name)
+        print(f"🧹 Stopping existing container...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print(f"🚀 Starting container...")
+    container = client.containers.run(
+        image, name=name, ports={f"{port}/tcp": port},
+        detach=True, shm_size="1g", mem_limit="4g",
+    )
+
+    print(f"⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                # Probe with the module's httpx import; a lazy `requests` import failure used to be swallowed here
+                if httpx.get(f"http://localhost:{port}/health", timeout=2).status_code == 200:
+                    print(f"✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # service not accepting connections yet; poll again
+    raise TimeoutError("Container failed to start")
+
+async def main():  # Runs the whole pool-stress scenario; returns the exit code (0 = pass, 1 = fail or error)
+ print("="*60)
+ print("TEST 5: Pool Stress - Mixed Configs")
+ print("="*60)
+
+ client = docker.from_env()
+ container = None
+ monitor_thread = None
+
+ try:
+ container = start_container(client, IMAGE, CONTAINER_NAME, PORT)
+
+ print(f"\n⏳ Waiting for permanent browser init (3s)...")
+ await asyncio.sleep(3)
+
+ # Start monitoring (reset shared state first so earlier runs do not leak samples)
+ stop_monitoring.clear()
+ stats_history.clear()
+ monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
+ monitor_thread.start()
+
+ await asyncio.sleep(1)
+ baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0  # 0 if no sample has arrived yet
+ print(f"📏 Baseline: {baseline_mem:.1f} MB\n")
+
+ url = f"http://localhost:{PORT}/crawl"
+
+ print(f"Testing {len(VIEWPORT_CONFIGS)} different configs:")
+ for i, vp in enumerate(VIEWPORT_CONFIGS):
+ vp_str = "Default" if vp is None else f"{vp['width']}x{vp['height']}"
+ print(f" {i+1}. {vp_str}")
+ print()
+
+ # Run requests: repeat each config REQUESTS_PER_CONFIG times
+ all_results = []
+ config_sequence = []
+
+ for _ in range(REQUESTS_PER_CONFIG):
+ for viewport in VIEWPORT_CONFIGS:
+ config_sequence.append(viewport)
+
+ # Shuffle to mix configs
+ random.shuffle(config_sequence)
+
+ print(f"🔄 Running {len(config_sequence)} requests with mixed configs...")
+
+ async with httpx.AsyncClient() as http_client:
+ for i, viewport in enumerate(config_sequence):
+ result = await crawl_with_viewport(http_client, url, viewport)  # awaited one at a time (sequential)
+ all_results.append(result)
+
+ if (i + 1) % 5 == 0:  # progress line for every 5th request only
+ vp_str = "default" if result['viewport'] is None else f"{result['viewport']['width']}x{result['viewport']['height']}"
+ status = "✓" if result.get('success') else "✗"
+ lat = f"{result.get('latency_ms', 0):.0f}ms" if 'latency_ms' in result else "error"
+ print(f" [{i+1}/{len(config_sequence)}] {status} {vp_str} - {lat}")
+
+ # Stop monitoring
+ await asyncio.sleep(2)
+ stop_monitoring.set()
+ if monitor_thread:
+ monitor_thread.join(timeout=2)  # wait at most 2s; the thread is a daemon
+
+ # Analyze results
+ pool_stats = analyze_pool_logs(container)
+
+ successes = sum(1 for r in all_results if r.get("success"))
+ success_rate = (successes / len(all_results)) * 100
+ latencies = [r["latency_ms"] for r in all_results if "latency_ms" in r]  # failed requests carry no latency
+ avg_lat = sum(latencies) / len(latencies) if latencies else 0
+
+ memory_samples = [s['memory_mb'] for s in stats_history]
+ peak_mem = max(memory_samples) if memory_samples else 0
+ final_mem = memory_samples[-1] if memory_samples else 0
+
+ print(f"\n{'='*60}")
+ print(f"RESULTS:")
+ print(f"{'='*60}")
+ print(f" Requests: {len(all_results)}")
+ print(f" Success Rate: {success_rate:.1f}% ({successes}/{len(all_results)})")
+ print(f" Avg Latency: {avg_lat:.0f}ms")
+ print(f"\n Pool Statistics:")
+ print(f" 🔥 Permanent: {pool_stats['permanent']}")
+ print(f" ♨️ Hot: {pool_stats['hot']}")
+ print(f" ❄️ Cold: {pool_stats['cold']}")
+ print(f" 🆕 New: {pool_stats['new']}")
+ print(f" ⬆️ Promotions: {pool_stats['promotions']}")
+ print(f" 📊 Reuse: {(pool_stats['total'] / len(all_results) * 100):.1f}%")  # 'total' = permanent + hot + cold log hits
+ print(f"\n Memory:")
+ print(f" Baseline: {baseline_mem:.1f} MB")
+ print(f" Peak: {peak_mem:.1f} MB")
+ print(f" Final: {final_mem:.1f} MB")
+ print(f" Delta: {final_mem - baseline_mem:+.1f} MB")
+ print(f"{'='*60}")
+
+ # Pass/Fail: only the success-rate check can fail the run; the other checks just print warnings
+ passed = True
+
+ if success_rate < 99:
+ print(f"❌ FAIL: Success rate {success_rate:.1f}% < 99%")
+ passed = False
+
+ # Should see promotions since we repeat each config 5 times
+ if pool_stats['promotions'] < (len(VIEWPORT_CONFIGS) - 1): # -1 for default
+ print(f"⚠️ WARNING: Only {pool_stats['promotions']} promotions (expected ~{len(VIEWPORT_CONFIGS)-1})")
+
+ # Should have created some browsers for different configs
+ if pool_stats['new'] == 0:
+ print(f"⚠️ NOTE: No new browsers created (all used default?)")
+
+ if pool_stats['permanent'] == len(all_results):
+ print(f"⚠️ NOTE: All requests used permanent browser (configs not varying enough?)")
+
+ if final_mem - baseline_mem > 500:
+ print(f"⚠️ WARNING: Memory grew {final_mem - baseline_mem:.1f} MB")
+
+ if passed:
+ print(f"✅ TEST PASSED")
+ return 0
+ else:
+ return 1
+
+ except Exception as e:
+ print(f"\n❌ TEST ERROR: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+ finally:  # runs on pass, fail and error alike
+ stop_monitoring.set()
+ if container:
+ print(f"🛑 Stopping container...")
+ container.stop()
+ container.remove()
+
+if __name__ == "__main__":
+    # exit() is a site-module helper meant for interactive use; raise SystemExit directly
+    raise SystemExit(asyncio.run(main()))
diff --git a/deploy/docker/tests/test_6_multi_endpoint.py b/deploy/docker/tests/test_6_multi_endpoint.py
new file mode 100755
index 00000000..2d532d3b
--- /dev/null
+++ b/deploy/docker/tests/test_6_multi_endpoint.py
@@ -0,0 +1,234 @@
+#!/usr/bin/env python3
+"""
+Test 6: Multi-Endpoint Testing
+- Tests multiple endpoints together: /html, /screenshot, /pdf, /crawl
+- Validates each endpoint works correctly
+- Monitors success rates per endpoint
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+
+# Config
+IMAGE = "crawl4ai-local:latest"
+CONTAINER_NAME = "crawl4ai-test"
+PORT = 11235
+REQUESTS_PER_ENDPOINT = 10
+
+# Stats
+stats_history = []
+stop_monitoring = Event()
+
+def monitor_stats(container):
+    """Background stats collector: record one memory sample (bytes / 1024², "MB") per stats frame."""
+    for stat in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+        try:
+            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
+            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
+        except (KeyError, TypeError, AttributeError):
+            pass  # malformed or partial stats frame: skip this sample, keep collecting
+        time.sleep(0.5)
+
+async def _post_repeatedly(client, url, payload, count, timeout=30.0):
+    """POST ``payload`` to ``url`` ``count`` times in sequence and collect the outcomes.
+
+    Args:
+        client: object exposing an awaitable ``post(url, json=..., timeout=...)``.
+        url: full endpoint URL.
+        payload: JSON-serialisable request body, re-sent unchanged each time.
+        count: number of sequential requests to issue.
+        timeout: per-request timeout passed straight to ``client.post``.
+
+    Returns:
+        One dict per request: ``{"success": bool, "latency_ms": float}``, or
+        ``{"success": False, "error": str}`` when the request raised.
+    """
+    results = []
+    for _ in range(count):
+        start = time.time()
+        try:
+            resp = await client.post(url, json=payload, timeout=timeout)
+            elapsed = (time.time() - start) * 1000
+            results.append({"success": resp.status_code == 200, "latency_ms": elapsed})
+        except Exception as e:
+            results.append({"success": False, "error": str(e)})
+    return results
+
+# Request body shared by the single-URL endpoints (/html, /screenshot, /pdf)
+_SINGLE_URL_PAYLOAD = {"url": "https://httpbin.org/html"}
+
+async def test_html(client, base_url, count):
+    """Test /html endpoint.
+
+    Sends ``count`` sequential POSTs to ``{base_url}/html``; returns one result dict per request.
+    """
+    return await _post_repeatedly(client, f"{base_url}/html", _SINGLE_URL_PAYLOAD, count)
+
+async def test_screenshot(client, base_url, count):
+    """Test /screenshot endpoint.
+
+    Sends ``count`` sequential POSTs to ``{base_url}/screenshot``; returns one result dict per request.
+    """
+    return await _post_repeatedly(client, f"{base_url}/screenshot", _SINGLE_URL_PAYLOAD, count)
+
+async def test_pdf(client, base_url, count):
+    """Test /pdf endpoint.
+
+    Sends ``count`` sequential POSTs to ``{base_url}/pdf``; returns one result dict per request.
+    """
+    return await _post_repeatedly(client, f"{base_url}/pdf", _SINGLE_URL_PAYLOAD, count)
+
+async def test_crawl(client, base_url, count):
+    """Test /crawl endpoint.
+
+    Sends ``count`` sequential POSTs to ``{base_url}/crawl`` with empty browser/crawler configs.
+    """
+    payload = {
+        "urls": ["https://httpbin.org/html"],
+        "browser_config": {},
+        "crawler_config": {}
+    }
+    return await _post_repeatedly(client, f"{base_url}/crawl", payload, count)
+
+def start_container(client, image, name, port):
+    """Start a fresh container and poll (up to 30 times) until GET /health returns 200."""
+    try:
+        old = client.containers.get(name)
+        print(f"🧹 Stopping existing container...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print(f"🚀 Starting container...")
+    container = client.containers.run(
+        image, name=name, ports={f"{port}/tcp": port},
+        detach=True, shm_size="1g", mem_limit="4g",
+    )
+
+    print(f"⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                # Probe with the module's httpx import; a lazy `requests` import failure used to be swallowed here
+                if httpx.get(f"http://localhost:{port}/health", timeout=2).status_code == 200:
+                    print(f"✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # service not accepting connections yet; poll again
+    raise TimeoutError("Container failed to start")
+
+async def main():  # Exercises each endpoint in turn; returns the exit code (0 = pass, 1 = fail or error)
+ print("="*60)
+ print("TEST 6: Multi-Endpoint Testing")
+ print("="*60)
+
+ client = docker.from_env()
+ container = None
+ monitor_thread = None
+
+ try:
+ container = start_container(client, IMAGE, CONTAINER_NAME, PORT)
+
+ print(f"\n⏳ Waiting for permanent browser init (3s)...")
+ await asyncio.sleep(3)
+
+ # Start monitoring (reset shared state first so earlier runs do not leak samples)
+ stop_monitoring.clear()
+ stats_history.clear()
+ monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
+ monitor_thread.start()
+
+ await asyncio.sleep(1)
+ baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0  # 0 if no sample has arrived yet
+ print(f"📏 Baseline: {baseline_mem:.1f} MB\n")
+
+ base_url = f"http://localhost:{PORT}"
+
+ # Test each endpoint (the loop below runs them in this order, one endpoint at a time)
+ endpoints = {
+ "/html": test_html,
+ "/screenshot": test_screenshot,
+ "/pdf": test_pdf,
+ "/crawl": test_crawl,
+ }
+
+ all_endpoint_stats = {}
+
+ async with httpx.AsyncClient() as http_client:
+ for endpoint_name, test_func in endpoints.items():
+ print(f"🔄 Testing {endpoint_name} ({REQUESTS_PER_ENDPOINT} requests)...")
+ results = await test_func(http_client, base_url, REQUESTS_PER_ENDPOINT)
+
+ successes = sum(1 for r in results if r.get("success"))
+ success_rate = (successes / len(results)) * 100
+ latencies = [r["latency_ms"] for r in results if "latency_ms" in r]  # failed requests carry no latency
+ avg_lat = sum(latencies) / len(latencies) if latencies else 0
+
+ all_endpoint_stats[endpoint_name] = {
+ 'success_rate': success_rate,
+ 'avg_latency': avg_lat,
+ 'total': len(results),
+ 'successes': successes
+ }
+
+ print(f" ✓ Success: {success_rate:.1f}% ({successes}/{len(results)}), Avg: {avg_lat:.0f}ms")
+
+ # Stop monitoring
+ await asyncio.sleep(1)
+ stop_monitoring.set()
+ if monitor_thread:
+ monitor_thread.join(timeout=2)  # wait at most 2s; the thread is a daemon
+
+ # Final stats
+ memory_samples = [s['memory_mb'] for s in stats_history]
+ peak_mem = max(memory_samples) if memory_samples else 0
+ final_mem = memory_samples[-1] if memory_samples else 0
+
+ print(f"\n{'='*60}")
+ print(f"RESULTS:")
+ print(f"{'='*60}")
+ for endpoint, stats in all_endpoint_stats.items():
+ print(f" {endpoint:12} Success: {stats['success_rate']:5.1f}% Avg: {stats['avg_latency']:6.0f}ms")
+
+ print(f"\n Memory:")
+ print(f" Baseline: {baseline_mem:.1f} MB")
+ print(f" Peak: {peak_mem:.1f} MB")
+ print(f" Final: {final_mem:.1f} MB")
+ print(f" Delta: {final_mem - baseline_mem:+.1f} MB")
+ print(f"{'='*60}")
+
+ # Pass/Fail: every endpoint must reach a 100% success rate; memory figures are informational
+ passed = True
+ for endpoint, stats in all_endpoint_stats.items():
+ if stats['success_rate'] < 100:
+ print(f"❌ FAIL: {endpoint} success rate {stats['success_rate']:.1f}% < 100%")
+ passed = False
+
+ if passed:
+ print(f"✅ TEST PASSED")
+ return 0
+ else:
+ return 1
+
+ except Exception as e:
+ print(f"\n❌ TEST ERROR: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+ finally:  # runs on pass, fail and error alike
+ stop_monitoring.set()
+ if container:
+ print(f"🛑 Stopping container...")
+ container.stop()
+ container.remove()
+
+if __name__ == "__main__":
+    # exit() is a site-module helper meant for interactive use; raise SystemExit directly
+    raise SystemExit(asyncio.run(main()))
diff --git a/deploy/docker/tests/test_7_cleanup.py b/deploy/docker/tests/test_7_cleanup.py
new file mode 100755
index 00000000..2fdbe9a6
--- /dev/null
+++ b/deploy/docker/tests/test_7_cleanup.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""
+Test 7: Cleanup Verification (Janitor)
+- Creates load spike then goes idle
+- Verifies memory returns to near baseline
+- Tests janitor cleanup of idle browsers
+- Monitors memory recovery time
+"""
+import asyncio
+import time
+import docker
+import httpx
+from threading import Thread, Event
+
+# Config
+IMAGE = "crawl4ai-local:latest"
+CONTAINER_NAME = "crawl4ai-test"
+PORT = 11235
+SPIKE_REQUESTS = 20 # Create some browsers
+IDLE_TIME = 90 # Wait 90s for janitor (runs every 60s)
+
+# Stats
+stats_history = []
+stop_monitoring = Event()
+
+def monitor_stats(container):
+    """Background stats collector: record one memory sample (bytes / 1024², "MB") per stats frame."""
+    for stat in container.stats(decode=True, stream=True):
+        if stop_monitoring.is_set():
+            break
+        try:
+            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
+            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
+        except (KeyError, TypeError, AttributeError):
+            pass  # malformed or partial stats frame: skip this sample, keep collecting
+        time.sleep(1)  # Sample every 1s for this test
+
+def start_container(client, image, name, port):
+    """Start a fresh container and poll (up to 30 times) until GET /health returns 200."""
+    try:
+        old = client.containers.get(name)
+        print(f"🧹 Stopping existing container...")
+        old.stop()
+        old.remove()
+    except docker.errors.NotFound:
+        pass
+
+    print(f"🚀 Starting container...")
+    container = client.containers.run(
+        image, name=name, ports={f"{port}/tcp": port},
+        detach=True, shm_size="1g", mem_limit="4g",
+    )
+
+    print(f"⏳ Waiting for health...")
+    for _ in range(30):
+        time.sleep(1)
+        container.reload()
+        if container.status == "running":
+            try:
+                # Probe with the module's httpx import; a lazy `requests` import failure used to be swallowed here
+                if httpx.get(f"http://localhost:{port}/health", timeout=2).status_code == 200:
+                    print(f"✅ Container healthy!")
+                    return container
+            except httpx.HTTPError:
+                pass  # service not accepting connections yet; poll again
+    raise TimeoutError("Container failed to start")
+
+async def main():  # Load spike, idle period, then memory-recovery report; returns the exit code (0 or 1)
+ print("="*60)
+ print("TEST 7: Cleanup Verification (Janitor)")
+ print("="*60)
+
+ client = docker.from_env()
+ container = None
+ monitor_thread = None
+
+ try:
+ container = start_container(client, IMAGE, CONTAINER_NAME, PORT)
+
+ print(f"\n⏳ Waiting for permanent browser init (3s)...")
+ await asyncio.sleep(3)
+
+ # Start monitoring (reset shared state first so earlier runs do not leak samples)
+ stop_monitoring.clear()
+ stats_history.clear()
+ monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
+ monitor_thread.start()
+
+ await asyncio.sleep(2)
+ baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0  # 0 if no sample has arrived yet
+ print(f"📏 Baseline: {baseline_mem:.1f} MB\n")
+
+ # Create load spike with different configs to populate pool
+ print(f"🔥 Creating load spike ({SPIKE_REQUESTS} requests with varied configs)...")
+ url = f"http://localhost:{PORT}/crawl"
+
+ viewports = [
+ {"width": 1920, "height": 1080},
+ {"width": 1024, "height": 768},
+ {"width": 375, "height": 667},
+ ]
+
+ async with httpx.AsyncClient(timeout=60.0) as http_client:
+ tasks = []
+ for i in range(SPIKE_REQUESTS):
+ vp = viewports[i % len(viewports)]  # cycle through the three viewports
+ payload = {
+ "urls": ["https://httpbin.org/html"],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "viewport": {"type": "dict", "value": vp},
+ "headless": True,
+ "text_mode": True,
+ "extra_args": [
+ "--no-sandbox", "--disable-dev-shm-usage",
+ "--disable-gpu", "--disable-software-rasterizer",
+ "--disable-web-security", "--allow-insecure-localhost",
+ "--ignore-certificate-errors"
+ ]
+ }
+ },
+ "crawler_config": {}
+ }
+ tasks.append(http_client.post(url, json=payload))  # not awaited here; gather() below runs them together
+
+ results = await asyncio.gather(*tasks, return_exceptions=True)  # failures come back as exception objects
+ successes = sum(1 for r in results if hasattr(r, 'status_code') and r.status_code == 200)  # hasattr skips exceptions
+ print(f" ✓ Spike completed: {successes}/{len(results)} successful")
+
+ # Measure peak
+ await asyncio.sleep(2)
+ peak_mem = max([s['memory_mb'] for s in stats_history]) if stats_history else baseline_mem
+ print(f" 📊 Peak memory: {peak_mem:.1f} MB (+{peak_mem - baseline_mem:.1f} MB)")
+
+ # Now go idle and wait for janitor
+ print(f"\n⏸️ Going idle for {IDLE_TIME}s (janitor cleanup)...")
+ print(f" (Janitor runs every 60s, checking for idle browsers)")
+
+ for elapsed in range(0, IDLE_TIME, 10):  # report the latest memory sample every 10s
+ await asyncio.sleep(10)
+ current_mem = stats_history[-1]['memory_mb'] if stats_history else 0
+ print(f" [{elapsed+10:3d}s] Memory: {current_mem:.1f} MB")
+
+ # Stop monitoring
+ stop_monitoring.set()
+ if monitor_thread:
+ monitor_thread.join(timeout=2)  # wait at most 2s; the thread is a daemon
+
+ # Analyze memory recovery
+ final_mem = stats_history[-1]['memory_mb'] if stats_history else 0
+ recovery_mb = peak_mem - final_mem
+ recovery_pct = (recovery_mb / (peak_mem - baseline_mem) * 100) if (peak_mem - baseline_mem) > 0 else 0  # avoids dividing by zero
+
+ print(f"\n{'='*60}")
+ print(f"RESULTS:")
+ print(f"{'='*60}")
+ print(f" Memory Journey:")
+ print(f" Baseline: {baseline_mem:.1f} MB")
+ print(f" Peak: {peak_mem:.1f} MB (+{peak_mem - baseline_mem:.1f} MB)")
+ print(f" Final: {final_mem:.1f} MB (+{final_mem - baseline_mem:.1f} MB)")
+ print(f" Recovered: {recovery_mb:.1f} MB ({recovery_pct:.1f}%)")
+ print(f"{'='*60}")
+
+ # Pass/Fail -- NOTE(review): 'passed' is never set to False below, so only an exception can fail this test; confirm intended
+ passed = True
+
+ # Should have created some memory pressure
+ if peak_mem - baseline_mem < 100:
+ print(f"⚠️ WARNING: Peak increase only {peak_mem - baseline_mem:.1f} MB (expected more browsers)")
+
+ # Should recover most memory (within 100MB of baseline)
+ if final_mem - baseline_mem > 100:
+ print(f"⚠️ WARNING: Memory didn't recover well (still +{final_mem - baseline_mem:.1f} MB above baseline)")
+ else:
+ print(f"✅ Good memory recovery!")
+
+ # Baseline + 50MB tolerance
+ if final_mem - baseline_mem < 50:
+ print(f"✅ Excellent cleanup (within 50MB of baseline)")
+
+ print(f"✅ TEST PASSED")
+ return 0
+
+ except Exception as e:
+ print(f"\n❌ TEST ERROR: {e}")
+ import traceback
+ traceback.print_exc()
+ return 1
+ finally:  # runs on pass and error alike
+ stop_monitoring.set()
+ if container:
+ print(f"🛑 Stopping container...")
+ container.stop()
+ container.remove()
+
+if __name__ == "__main__":
+    # exit() is a site-module helper meant for interactive use; raise SystemExit directly
+    raise SystemExit(asyncio.run(main()))
diff --git a/deploy/docker/tests/test_monitor_demo.py b/deploy/docker/tests/test_monitor_demo.py
new file mode 100644
index 00000000..2dbff5b1
--- /dev/null
+++ b/deploy/docker/tests/test_monitor_demo.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""Quick test to generate monitor dashboard activity"""
+import httpx
+import asyncio
+
+async def test_dashboard():
+    """Fire two crawl requests, then read back the monitor endpoints and print what they report."""
+    async with httpx.AsyncClient(timeout=30.0) as client:
+        print("📊 Generating dashboard activity...")
+
+        # Step 1: crawl a single URL
+        print("\n1️⃣ Running simple crawl...")
+        single_payload = {"urls": ["https://httpbin.org/html"], "crawler_config": {}}
+        simple = await client.post(
+            "http://localhost:11235/crawl",
+            json=single_payload,
+        )
+        print(f" Status: {simple.status_code}")
+
+        # Step 2: crawl two URLs in one request
+        print("\n2️⃣ Running multi-URL crawl...")
+        multi_payload = {
+            "urls": ["https://httpbin.org/html", "https://httpbin.org/json"],
+            "crawler_config": {},
+        }
+        multi = await client.post(
+            "http://localhost:11235/crawl",
+            json=multi_payload,
+        )
+        print(f" Status: {multi.status_code}")
+
+        # Step 3: monitor health snapshot
+        print("\n3️⃣ Checking monitor health...")
+        health_resp = await client.get("http://localhost:11235/monitor/health")
+        health = health_resp.json()
+        print(f" Memory: {health['container']['memory_percent']}%")
+        print(f" Browsers: {health['pool']['permanent']['active']}")
+
+        # Step 4: request log
+        print("\n4️⃣ Checking request log...")
+        log_resp = await client.get("http://localhost:11235/monitor/requests")
+        request_log = log_resp.json()
+        print(f" Active: {len(request_log['active'])}")
+        print(f" Completed: {len(request_log['completed'])}")
+
+        # Step 5: per-endpoint statistics
+        print("\n5️⃣ Checking endpoint stats...")
+        stats_resp = await client.get("http://localhost:11235/monitor/endpoints/stats")
+        stats = stats_resp.json()
+        for endpoint, data in stats.items():
+            print(f" {endpoint}: {data['count']} requests, {data['avg_latency_ms']}ms avg")
+
+        print("\n✅ Dashboard should now show activity!")
+        print(f"\n🌐 Open: http://localhost:11235/dashboard")
+
+if __name__ == "__main__":
+ asyncio.run(test_dashboard())
diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py
index 5f3618af..52f4e11f 100644
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -178,4 +178,29 @@ def verify_email_domain(email: str) -> bool:
records = dns.resolver.resolve(domain, 'MX')
return True if records else False
except Exception as e:
- return False
\ No newline at end of file
+ return False
+
+def get_container_memory_percent() -> float:
+    """Return container memory usage as a percentage of its limit (cgroup v1/v2 aware)."""
+    try:
+        # Try cgroup v2 first
+        usage_path = Path("/sys/fs/cgroup/memory.current")
+        limit_path = Path("/sys/fs/cgroup/memory.max")
+        if not usage_path.exists():
+            # Fall back to cgroup v1
+            usage_path = Path("/sys/fs/cgroup/memory/memory.usage_in_bytes")
+            limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")
+
+        usage = int(usage_path.read_text())
+        raw_limit = limit_path.read_text().strip()
+        # Unlimited is the literal "max" on v2 (int() would raise) and a huge number on v1
+        limit = float("inf") if raw_limit == "max" else int(raw_limit)
+        if limit > 1e18:
+            import psutil
+            limit = psutil.virtual_memory().total
+
+        return (usage / limit) * 100
+    except Exception:
+        # Non-container or unsupported: fallback to host
+        import psutil
+        return psutil.virtual_memory().percent
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 100d6973..cb99c18f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,15 +6,16 @@ x-base-config: &base-config
- "11235:11235" # Gunicorn port
env_file:
- .llm.env # API keys (create from .llm.env.example)
- environment:
- - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- - GROQ_API_KEY=${GROQ_API_KEY:-}
- - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
- - LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
+ # Uncomment to set default environment variables (these take precedence over values from .llm.env)
+ # environment:
+ # - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+ # - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
+ # - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+ # - GROQ_API_KEY=${GROQ_API_KEY:-}
+ # - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
+ # - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
+ # - GEMINI_API_KEY=${GEMINI_API_KEY:-}
+ # - LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
volumes:
- /dev/shm:/dev/shm # Chromium performance
deploy:
diff --git a/docs/blog/release-v0.7.7.md b/docs/blog/release-v0.7.7.md
new file mode 100644
index 00000000..190cd374
--- /dev/null
+++ b/docs/blog/release-v0.7.7.md
@@ -0,0 +1,626 @@
+# 🚀 Crawl4AI v0.7.7: The Self-Hosting & Monitoring Update
+
+*November 14, 2025 • 10 min read*
+
+---
+
+Today I'm releasing Crawl4AI v0.7.7—the Self-Hosting & Monitoring Update. This release transforms Crawl4AI Docker from a simple containerized crawler into a complete self-hosting platform with enterprise-grade real-time monitoring, full operational transparency, and production-ready observability.
+
+## 🎯 What's New at a Glance
+
+- **📊 Real-time Monitoring Dashboard**: Interactive web UI with live system metrics and browser pool status
+- **🔌 Comprehensive Monitor API**: Complete REST API for programmatic access to all monitoring data
+- **⚡ WebSocket Streaming**: Real-time updates every 2 seconds for custom dashboards
+- **🎮 Control Actions**: Manual browser management (kill, restart, cleanup)
+- **🔥 Smart Browser Pool**: 3-tier architecture (permanent/hot/cold) with automatic promotion
+- **🧹 Janitor Cleanup System**: Automatic resource management with event logging
+- **📈 Production Metrics**: 6 critical metrics for operational excellence
+- **🏭 Integration Ready**: Prometheus, alerting, and log aggregation examples
+- **🐛 Critical Bug Fixes**: Async LLM extraction, DFS crawling, viewport config, and more
+
+## 📊 Real-time Monitoring Dashboard: Complete Visibility
+
+**The Problem:** Running Crawl4AI in Docker was like flying blind. Users had no visibility into what was happening inside the container—memory usage, active requests, browser pools, or errors. Troubleshooting required checking logs, and there was no way to monitor performance or manually intervene when issues occurred.
+
+**My Solution:** I built a complete real-time monitoring system with an interactive dashboard, comprehensive REST API, WebSocket streaming, and manual control actions. Now you have full transparency and control over your crawling infrastructure.
+
+### The Self-Hosting Value Proposition
+
+Before v0.7.7, Docker was just a containerized crawler. After v0.7.7, it's a complete self-hosting platform that gives you:
+
+- **🔒 Data Privacy**: Your data never leaves your infrastructure
+- **💰 Cost Control**: No per-request pricing or rate limits
+- **🎯 Full Customization**: Complete control over configurations and strategies
+- **📊 Complete Transparency**: Real-time visibility into every aspect
+- **⚡ Performance**: Direct access without network overhead
+- **🛡️ Enterprise Security**: Keep workflows behind your firewall
+
+### Interactive Monitoring Dashboard
+
+Access the dashboard at `http://localhost:11235/dashboard` to see:
+
+- **System Health Overview**: CPU, memory, network, and uptime in real-time
+- **Live Request Tracking**: Active and completed requests with full details
+- **Browser Pool Management**: Interactive table with permanent/hot/cold browsers
+- **Janitor Events Log**: Automatic cleanup activities
+- **Error Monitoring**: Full context error logs
+
+The dashboard updates every 2 seconds via WebSocket, giving you live visibility into your crawling operations.
+
+## 🔌 Monitor API: Programmatic Access
+
+**The Problem:** Monitoring dashboards are great for humans, but automation and integration require programmatic access.
+
+**My Solution:** A comprehensive REST API that exposes all monitoring data for integration with your existing infrastructure.
+
+### System Health Endpoint
+
+```python
+import httpx
+import asyncio
+
+async def monitor_system_health():
+ async with httpx.AsyncClient() as client:
+ response = await client.get("http://localhost:11235/monitor/health")
+ health = response.json()
+
+ print(f"Container Metrics:")
+ print(f" CPU: {health['container']['cpu_percent']:.1f}%")
+ print(f" Memory: {health['container']['memory_percent']:.1f}%")
+ print(f" Uptime: {health['container']['uptime_seconds']}s")
+
+ print(f"\nBrowser Pool:")
+ print(f" Permanent: {health['pool']['permanent']['active']} active")
+ print(f" Hot Pool: {health['pool']['hot']['count']} browsers")
+ print(f" Cold Pool: {health['pool']['cold']['count']} browsers")
+
+ print(f"\nStatistics:")
+ print(f" Total Requests: {health['stats']['total_requests']}")
+ print(f" Success Rate: {health['stats']['success_rate_percent']:.1f}%")
+ print(f" Avg Latency: {health['stats']['avg_latency_ms']:.0f}ms")
+
+asyncio.run(monitor_system_health())
+```
+
+### Request Tracking
+
+```python
+async def track_requests():
+ async with httpx.AsyncClient() as client:
+ response = await client.get("http://localhost:11235/monitor/requests")
+ requests_data = response.json()
+
+ print(f"Active Requests: {len(requests_data['active'])}")
+ print(f"Completed Requests: {len(requests_data['completed'])}")
+
+ # See details of recent requests
+ for req in requests_data['completed'][:5]:
+ status_icon = "✅" if req['success'] else "❌"
+ print(f"{status_icon} {req['endpoint']} - {req['latency_ms']:.0f}ms")
+```
+
+### Browser Pool Management
+
+```python
+async def monitor_browser_pool():
+ async with httpx.AsyncClient() as client:
+ response = await client.get("http://localhost:11235/monitor/browsers")
+ browsers = response.json()
+
+ print(f"Pool Summary:")
+ print(f" Total Browsers: {browsers['summary']['total_count']}")
+ print(f" Total Memory: {browsers['summary']['total_memory_mb']} MB")
+ print(f" Reuse Rate: {browsers['summary']['reuse_rate_percent']:.1f}%")
+
+ # List all browsers
+ for browser in browsers['permanent']:
+ print(f"🔥 Permanent: {browser['browser_id'][:8]}... | "
+ f"Requests: {browser['request_count']} | "
+ f"Memory: {browser['memory_mb']:.0f} MB")
+```
+
+### Endpoint Performance Statistics
+
+```python
+async def get_endpoint_stats():
+ async with httpx.AsyncClient() as client:
+ response = await client.get("http://localhost:11235/monitor/endpoints/stats")
+ stats = response.json()
+
+ print("Endpoint Analytics:")
+ for endpoint, data in stats.items():
+ print(f" {endpoint}:")
+ print(f" Requests: {data['count']}")
+ print(f" Avg Latency: {data['avg_latency_ms']:.0f}ms")
+ print(f" Success Rate: {data['success_rate_percent']:.1f}%")
+```
+
+### Complete API Reference
+
+The Monitor API includes these endpoints:
+
+- `GET /monitor/health` - System health with pool statistics
+- `GET /monitor/requests` - Active and completed request tracking
+- `GET /monitor/browsers` - Browser pool details and efficiency
+- `GET /monitor/endpoints/stats` - Per-endpoint performance analytics
+- `GET /monitor/timeline?minutes=5` - Time-series data for charts
+- `GET /monitor/logs/janitor?limit=10` - Cleanup activity logs
+- `GET /monitor/logs/errors?limit=10` - Error logs with context
+- `POST /monitor/actions/cleanup` - Force immediate cleanup
+- `POST /monitor/actions/kill_browser` - Kill specific browser
+- `POST /monitor/actions/restart_browser` - Restart browser
+- `POST /monitor/stats/reset` - Reset accumulated statistics
+
+## ⚡ WebSocket Streaming: Real-time Updates
+
+**The Problem:** Polling the API every few seconds wastes resources and adds latency. Real-time dashboards need instant updates.
+
+**My Solution:** WebSocket streaming with 2-second update intervals for building custom real-time dashboards.
+
+### WebSocket Integration Example
+
+```python
+import websockets
+import json
+import asyncio
+
+async def monitor_realtime():
+ uri = "ws://localhost:11235/monitor/ws"
+
+ async with websockets.connect(uri) as websocket:
+ print("Connected to real-time monitoring stream")
+
+ while True:
+ # Receive update every 2 seconds
+ data = await websocket.recv()
+ update = json.loads(data)
+
+ # Access all monitoring data
+ print(f"\n--- Update at {update['timestamp']} ---")
+ print(f"Memory: {update['health']['container']['memory_percent']:.1f}%")
+ print(f"Active Requests: {len(update['requests']['active'])}")
+ print(f"Total Browsers: {update['browsers']['summary']['total_count']}")
+
+ if update['errors']:
+ print(f"⚠️ Recent Errors: {len(update['errors'])}")
+
+asyncio.run(monitor_realtime())
+```
+
+**Expected Real-World Impact:**
+- **Custom Dashboards**: Build tailored monitoring UIs for your team
+- **Real-time Alerting**: Trigger alerts instantly when metrics exceed thresholds
+- **Integration**: Feed live data into monitoring tools like Grafana
+- **Automation**: React to events in real-time without polling
+
+## 🔥 Smart Browser Pool: 3-Tier Architecture
+
+**The Problem:** Creating a new browser for every request is slow and memory-intensive. Traditional browser pools are static and inefficient.
+
+**My Solution:** A smart 3-tier browser pool that automatically adapts to usage patterns.
+
+### How It Works
+
+```python
+import httpx
+
+async def demonstrate_browser_pool():
+ async with httpx.AsyncClient() as client:
+ # Request 1-3: Default config → Uses permanent browser
+ print("Phase 1: Using permanent browser")
+ for i in range(3):
+ await client.post(
+ "http://localhost:11235/crawl",
+ json={"urls": [f"https://httpbin.org/html?req={i}"]}
+ )
+ print(f" Request {i+1}: Reused permanent browser")
+
+ # Request 4-7: Custom viewport → cold pool at first, promoted to hot pool after 3 uses
+ print("\nPhase 2: Custom config creates cold pool browser")
+ viewport_config = {"viewport": {"width": 1280, "height": 720}}
+ for i in range(4):
+ await client.post(
+ "http://localhost:11235/crawl",
+ json={
+ "urls": [f"https://httpbin.org/json?v={i}"],
+ "browser_config": viewport_config
+ }
+ )
+ if i < 2:
+ print(f" Request {i+1}: Cold pool browser")
+ else:
+ print(f" Request {i+1}: Promoted to hot pool! (after 3 uses)")
+
+ # Check pool status
+ response = await client.get("http://localhost:11235/monitor/browsers")
+ browsers = response.json()
+
+ print(f"\nPool Status:")
+ print(f" Permanent: {len(browsers['permanent'])} (always active)")
+ print(f" Hot: {len(browsers['hot'])} (frequently used configs)")
+ print(f" Cold: {len(browsers['cold'])} (on-demand)")
+ print(f" Reuse Rate: {browsers['summary']['reuse_rate_percent']:.1f}%")
+
+asyncio.run(demonstrate_browser_pool())
+```
+
+**Pool Tiers:**
+
+- **🔥 Permanent Browser**: Always-on, default configuration, instant response
+- **♨️ Hot Pool**: Browsers promoted after 3+ uses, kept warm for quick access
+- **❄️ Cold Pool**: On-demand browsers for variant configs, cleaned up when idle
+
+**Expected Real-World Impact:**
+- **Memory Efficiency**: 10x reduction in memory usage vs creating browsers per request
+- **Performance**: Instant access to frequently-used configurations
+- **Automatic Optimization**: Pool adapts to your usage patterns
+- **Resource Management**: Janitor automatically cleans up idle browsers
+
+## 🧹 Janitor System: Automatic Cleanup
+
+**The Problem:** Long-running crawlers accumulate idle browsers and consume memory over time.
+
+**My Solution:** An automatic janitor system that monitors and cleans up idle resources.
+
+```python
+async def monitor_janitor_activity():
+ async with httpx.AsyncClient() as client:
+ response = await client.get("http://localhost:11235/monitor/logs/janitor?limit=5")
+ logs = response.json()
+
+ print("Recent Cleanup Activities:")
+ for log in logs:
+ print(f" {log['timestamp']}: {log['message']}")
+
+# Example output:
+# 2025-11-14 10:30:00: Cleaned up 2 cold pool browsers (idle > 5min)
+# 2025-11-14 10:25:00: Browser reuse rate: 85.3%
+# 2025-11-14 10:20:00: Hot pool browser promoted (10 requests)
+```
+
+## 🎮 Control Actions: Manual Management
+
+**The Problem:** Sometimes you need to manually intervene—kill a stuck browser, force cleanup, or restart resources.
+
+**My Solution:** Manual control actions via the API for operational troubleshooting.
+
+### Force Cleanup
+
+```python
+async def force_cleanup():
+ async with httpx.AsyncClient() as client:
+ response = await client.post("http://localhost:11235/monitor/actions/cleanup")
+ result = response.json()
+
+ print(f"Cleanup completed:")
+ print(f" Browsers cleaned: {result.get('cleaned_count', 0)}")
+ print(f" Memory freed: {result.get('memory_freed_mb', 0):.1f} MB")
+```
+
+### Kill Specific Browser
+
+```python
+async def kill_stuck_browser(browser_id: str):
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ "http://localhost:11235/monitor/actions/kill_browser",
+ json={"browser_id": browser_id}
+ )
+
+ if response.status_code == 200:
+ print(f"✅ Browser {browser_id} killed successfully")
+```
+
+### Reset Statistics
+
+```python
+async def reset_stats():
+ async with httpx.AsyncClient() as client:
+ response = await client.post("http://localhost:11235/monitor/stats/reset")
+ print("📊 Statistics reset for fresh monitoring")
+```
+
+## 📈 Production Integration Patterns
+
+### Prometheus Integration
+
+```python
+# Export metrics for Prometheus scraping
+async def export_prometheus_metrics():
+ async with httpx.AsyncClient() as client:
+ health = await client.get("http://localhost:11235/monitor/health")
+ data = health.json()
+
+ # Export in Prometheus format
+ metrics = f"""
+# HELP crawl4ai_memory_usage_percent Memory usage percentage
+# TYPE crawl4ai_memory_usage_percent gauge
+crawl4ai_memory_usage_percent {data['container']['memory_percent']}
+
+# HELP crawl4ai_request_success_rate Request success rate
+# TYPE crawl4ai_request_success_rate gauge
+crawl4ai_request_success_rate {data['stats']['success_rate_percent']}
+
+# HELP crawl4ai_browser_pool_count Total browsers in pool
+# TYPE crawl4ai_browser_pool_count gauge
+crawl4ai_browser_pool_count {data['pool']['permanent']['active'] + data['pool']['hot']['count'] + data['pool']['cold']['count']}
+"""
+ return metrics
+```
+
+### Alerting Example
+
+```python
+async def check_alerts():
+ async with httpx.AsyncClient() as client:
+ health = await client.get("http://localhost:11235/monitor/health")
+ data = health.json()
+
+ # Memory alert
+ if data['container']['memory_percent'] > 80:
+ print("🚨 ALERT: Memory usage above 80%")
+ # Trigger cleanup
+ await client.post("http://localhost:11235/monitor/actions/cleanup")
+
+ # Success rate alert
+ if data['stats']['success_rate_percent'] < 90:
+ print("🚨 ALERT: Success rate below 90%")
+ # Check error logs
+ errors = await client.get("http://localhost:11235/monitor/logs/errors")
+ print(f"Recent errors: {len(errors.json())}")
+
+ # Latency alert
+ if data['stats']['avg_latency_ms'] > 5000:
+ print("🚨 ALERT: Average latency above 5s")
+```
+
+### Key Metrics to Track
+
+```python
+CRITICAL_METRICS = {
+ "memory_usage": {
+ "current": "container.memory_percent",
+ "target": "<80%",
+ "alert_threshold": ">80%",
+ "action": "Force cleanup or scale"
+ },
+ "success_rate": {
+ "current": "stats.success_rate_percent",
+ "target": ">95%",
+ "alert_threshold": "<90%",
+ "action": "Check error logs"
+ },
+ "avg_latency": {
+ "current": "stats.avg_latency_ms",
+ "target": "<2000ms",
+ "alert_threshold": ">5000ms",
+ "action": "Investigate slow requests"
+ },
+ "browser_reuse_rate": {
+ "current": "browsers.summary.reuse_rate_percent",
+ "target": ">80%",
+ "alert_threshold": "<60%",
+ "action": "Check pool configuration"
+ },
+ "total_browsers": {
+ "current": "browsers.summary.total_count",
+ "target": "<15",
+ "alert_threshold": ">20",
+ "action": "Check for browser leaks"
+ },
+ "error_frequency": {
+ "current": "len(errors)",
+ "target": "<5/hour",
+ "alert_threshold": ">10/hour",
+ "action": "Review error patterns"
+ }
+}
+```
+
+## 🐛 Critical Bug Fixes
+
+This release includes significant bug fixes that improve stability and performance:
+
+### Async LLM Extraction (#1590)
+
+**The Problem:** LLM extraction was blocking async execution, causing URLs to be processed sequentially instead of in parallel (issue #1055).
+
+**The Fix:** Resolved the blocking issue to enable true parallel processing for LLM extraction.
+
+```python
+# Before v0.7.7: Sequential processing
+# After v0.7.7: True parallel processing
+
+async with AsyncWebCrawler() as crawler:
+ urls = ["url1", "url2", "url3", "url4"]
+
+ # Now processes truly in parallel with LLM extraction
+ results = await crawler.arun_many(
+ urls,
+ config=CrawlerRunConfig(
+ extraction_strategy=LLMExtractionStrategy(...)
+ )
+ )
+ # 4x faster for parallel LLM extraction!
+```
+
+**Expected Impact:** Major performance improvement for batch LLM extraction workflows.
+
+### DFS Deep Crawling (#1607)
+
+**The Problem:** DFS (Depth-First Search) deep crawl strategy had implementation issues.
+
+**The Fix:** Enhanced DFSDeepCrawlStrategy with proper seen URL tracking and improved documentation.
+
+### Browser & Crawler Config Documentation (#1609)
+
+**The Problem:** Documentation didn't match the actual `async_configs.py` implementation.
+
+**The Fix:** Updated all configuration documentation to accurately reflect the current implementation.
+
+### Sitemap Seeder (#1598)
+
+**The Problem:** Sitemap parsing and URL normalization issues in AsyncUrlSeeder (issue #1559).
+
+**The Fix:** Added comprehensive tests and fixes for sitemap namespace parsing and URL normalization.
+
+### Remove Overlay Elements (#1529)
+
+**The Problem:** The `remove_overlay_elements` functionality wasn't working (issue #1396).
+
+**The Fix:** Fixed by properly calling the injected JavaScript function.
+
+### Viewport Configuration (#1495)
+
+**The Problem:** Viewport configuration wasn't working in managed browsers (issue #1490).
+
+**The Fix:** Added proper viewport size configuration support for browser launch.
+
+### Managed Browser CDP Timing (#1528)
+
+**The Problem:** CDP (Chrome DevTools Protocol) endpoint verification had timing issues causing connection failures (issue #1445).
+
+**The Fix:** Added exponential backoff for CDP endpoint verification to handle timing variations.
+
+### Security Updates
+
+- **pyOpenSSL**: Updated from >=24.3.0 to >=25.3.0 to address a security vulnerability
+- Added verification tests for the security update
+
+### Docker Fixes
+
+- **Port Standardization**: Fixed inconsistent port usage (11234 vs 11235) - now standardized to 11235
+- **LLM Environment**: Fixed LLM API key handling for multi-provider support (PR #1537)
+- **Error Handling**: Improved Docker API error messages with comprehensive status codes
+- **Serialization**: Fixed `fit_html` property serialization in `/crawl` and `/crawl/stream` endpoints
+
+### Other Important Fixes
+
+- **arun_many Returns**: Fixed function to always return a list, even on exception (PR #1530)
+- **Webhook Serialization**: Properly serialize Pydantic HttpUrl in webhook config
+- **LLMConfig Documentation**: Fixed casing and variable name consistency (issue #1551)
+- **Python Version**: Dropped Python 3.9 support, now requires Python >=3.10
+
+## 📊 Expected Real-World Impact
+
+### For DevOps & Infrastructure Teams
+- **Full Visibility**: Know exactly what's happening inside your crawling infrastructure
+- **Proactive Monitoring**: Catch issues before they become problems
+- **Resource Optimization**: Identify memory leaks and performance bottlenecks
+- **Operational Control**: Manual intervention when automated systems need help
+
+### For Production Deployments
+- **Enterprise Observability**: Prometheus, Grafana, and alerting integration
+- **Debugging**: Real-time logs and error tracking
+- **Capacity Planning**: Historical metrics for scaling decisions
+- **SLA Monitoring**: Track success rates and latency against targets
+
+### For Development Teams
+- **Local Monitoring**: Understand crawler behavior during development
+- **Performance Testing**: Measure impact of configuration changes
+- **Troubleshooting**: Quickly identify and fix issues
+- **Learning**: See exactly how the browser pool works
+
+## 🔄 Breaking Changes
+
+**None!** This release is fully backward compatible.
+
+- All existing Docker configurations continue to work
+- No API changes to existing endpoints
+- Monitoring is additive functionality
+- No migration required
+
+## 🚀 Upgrade Instructions
+
+### Docker
+
+```bash
+# Pull the latest version
+docker pull unclecode/crawl4ai:0.7.7
+
+# Or use the latest tag
+docker pull unclecode/crawl4ai:latest
+
+# Run with monitoring enabled (default)
+docker run -d \
+ -p 11235:11235 \
+ --shm-size=1g \
+ --name crawl4ai \
+ unclecode/crawl4ai:0.7.7
+
+# Access the monitoring dashboard
+open http://localhost:11235/dashboard
+```
+
+### Python Package
+
+```bash
+# Upgrade to latest version
+pip install --upgrade crawl4ai
+
+# Or install specific version
+pip install crawl4ai==0.7.7
+```
+
+## 🎬 Try the Demo
+
+Run the comprehensive demo that showcases all monitoring features:
+
+```bash
+python docs/releases_review/demo_v0.7.7.py
+```
+
+**The demo includes:**
+1. System health overview with live metrics
+2. Request tracking with active/completed monitoring
+3. Browser pool management (permanent/hot/cold)
+4. Complete Monitor API endpoint examples
+5. WebSocket streaming demonstration
+6. Control actions (cleanup, kill, restart)
+7. Production metrics and alerting patterns
+8. Self-hosting value proposition
+
+## 📚 Documentation
+
+### New Documentation
+- **[Self-Hosting Guide](https://docs.crawl4ai.com/core/self-hosting/)** - Complete self-hosting documentation with monitoring
+- **Demo Script**: `docs/releases_review/demo_v0.7.7.py` - Working examples
+
+### Updated Documentation
+- **Docker Deployment** → **Self-Hosting** (renamed for better positioning)
+- Added comprehensive monitoring sections
+- Production integration patterns
+- WebSocket streaming examples
+
+## 💡 Pro Tips
+
+1. **Start with the dashboard** - Visit `/dashboard` to get familiar with the monitoring system
+2. **Track the 6 key metrics** - Memory, success rate, latency, reuse rate, browser count, errors
+3. **Set up alerting early** - Use the Monitor API to build alerts before issues occur
+4. **Monitor browser pool efficiency** - Aim for >80% reuse rate for optimal performance
+5. **Use WebSocket for custom dashboards** - Build tailored monitoring UIs for your team
+6. **Leverage Prometheus integration** - Export metrics for long-term storage and analysis
+7. **Check janitor logs** - Understand automatic cleanup patterns
+8. **Use control actions judiciously** - Manual interventions are for exceptional cases
+
+## 🙏 Acknowledgments
+
+Thank you to our community for the feedback, bug reports, and feature requests that shaped this release. Special thanks to everyone who contributed to the issues that were fixed in this version.
+
+The monitoring system was built based on real user needs for production deployments, and your input made it comprehensive and practical.
+
+## 📞 Support & Resources
+
+- **📖 Documentation**: [docs.crawl4ai.com](https://docs.crawl4ai.com)
+- **🐙 GitHub**: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
+- **💬 Discord**: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
+- **🐦 Twitter**: [@unclecode](https://x.com/unclecode)
+- **📊 Dashboard**: `http://localhost:11235/dashboard` (when running)
+
+---
+
+**Crawl4AI v0.7.7 delivers complete self-hosting with enterprise-grade monitoring. You now have full visibility and control over your web crawling infrastructure. The monitoring dashboard, comprehensive API, and WebSocket streaming give you everything needed for production deployments. Try the self-hosting platform—it's a game changer for operational excellence!**
+
+**Happy crawling with full visibility!** 🕷️📊
+
+*- unclecode*
diff --git a/docs/examples/c4a_script/tutorial/README.md b/docs/examples/c4a_script/tutorial/README.md
index 81f855ee..2d6940bb 100644
--- a/docs/examples/c4a_script/tutorial/README.md
+++ b/docs/examples/c4a_script/tutorial/README.md
@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
2. **Install Dependencies**
```bash
- pip install flask
+ pip install -r requirements.txt
```
3. **Launch the Server**
@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
4. **Open in Browser**
```
- http://localhost:8080
+ http://localhost:8000
```
**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)
@@ -325,7 +325,7 @@ Powers the recording functionality:
### Configuration
```python
# server.py configuration
-PORT = 8080
+PORT = 8000
DEBUG = True
THREADED = True
```
@@ -343,9 +343,9 @@ THREADED = True
**Port Already in Use**
```bash
# Kill existing process
-lsof -ti:8080 | xargs kill -9
+lsof -ti:8000 | xargs kill -9
# Or use different port
-python server.py --port 8081
+python server.py --port 8001
```
**Blockly Not Loading**
diff --git a/docs/examples/c4a_script/tutorial/server.py b/docs/examples/c4a_script/tutorial/server.py
index f9cb81e9..2537e4c3 100644
--- a/docs/examples/c4a_script/tutorial/server.py
+++ b/docs/examples/c4a_script/tutorial/server.py
@@ -216,7 +216,7 @@ def get_examples():
'name': 'Handle Cookie Banner',
'description': 'Accept cookies and close newsletter popup',
'script': '''# Handle cookie banner and newsletter
-GO http://127.0.0.1:8080/playground/
+GO http://127.0.0.1:8000/playground/
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''
diff --git a/docs/examples/dfs_crawl_demo.py b/docs/examples/dfs_crawl_demo.py
new file mode 100644
index 00000000..321c4131
--- /dev/null
+++ b/docs/examples/dfs_crawl_demo.py
@@ -0,0 +1,39 @@
+"""
+Simple demonstration of the DFS deep crawler visiting multiple pages.
+
+Run with: python docs/examples/dfs_crawl_demo.py
+"""
+import asyncio
+
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.cache_context import CacheMode
+from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def main() -> None:
+ dfs_strategy = DFSDeepCrawlStrategy(
+ max_depth=3,
+ max_pages=50,
+ include_external=False,
+ )
+
+ config = CrawlerRunConfig(
+ deep_crawl_strategy=dfs_strategy,
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(),
+ stream=True,
+ )
+
+ seed_url = "https://docs.python.org/3/" # Plenty of internal links
+
+ async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
+ async for result in await crawler.arun(url=seed_url, config=config):
+ depth = result.metadata.get("depth")
+ status = "SUCCESS" if result.success else "FAILED"
+ print(f"[{status}] depth={depth} url={result.url}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/nst_proxy/api_proxy_example.py b/docs/examples/nst_proxy/api_proxy_example.py
new file mode 100644
index 00000000..11847697
--- /dev/null
+++ b/docs/examples/nst_proxy/api_proxy_example.py
@@ -0,0 +1,48 @@
+"""
+NSTProxy Integration Examples for crawl4ai
+------------------------------------------
+
+NSTProxy is a premium residential proxy provider.
+👉 Purchase Proxies: https://nstproxy.com
+💰 Use coupon code "crawl4ai" for 10% off your plan.
+
+"""
+import asyncio, requests
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+
+async def main():
+ """
+ Example: Dynamically fetch a proxy from NSTProxy API before crawling.
+ """
+ NST_TOKEN = "YOUR_NST_PROXY_TOKEN" # Get from https://app.nstproxy.com/profile
+ CHANNEL_ID = "YOUR_NST_PROXY_CHANNEL_ID" # Your NSTProxy Channel ID
+ country = "ANY" # e.g. "ANY", "US", "DE"
+
+ # Fetch proxy from NSTProxy API
+ api_url = (
+ f"https://api.nstproxy.com/api/v1/generate/apiproxies"
+ f"?fType=2&channelId={CHANNEL_ID}&country={country}"
+ f"&protocol=http&sessionDuration=10&count=1&token={NST_TOKEN}"
+ )
+ response = requests.get(api_url, timeout=10).json()
+ proxy = response[0]
+
+ ip = proxy.get("ip")
+ port = proxy.get("port")
+ username = proxy.get("username", "")
+ password = proxy.get("password", "")
+
+ browser_config = BrowserConfig(proxy_config={
+ "server": f"http://{ip}:{port}",
+ "username": username,
+ "password": password,
+ })
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ print("[API Proxy] Status:", result.status_code)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/nst_proxy/auth_proxy_example.py b/docs/examples/nst_proxy/auth_proxy_example.py
new file mode 100644
index 00000000..6fb838b4
--- /dev/null
+++ b/docs/examples/nst_proxy/auth_proxy_example.py
@@ -0,0 +1,31 @@
+"""
+NSTProxy Integration Examples for crawl4ai
+------------------------------------------
+
+NSTProxy is a premium residential proxy provider.
+👉 Purchase Proxies: https://nstproxy.com
+💰 Use coupon code "crawl4ai" for 10% off your plan.
+
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+
+async def main():
+ """
+ Example: Use NSTProxy with manual username/password authentication.
+ """
+
+ browser_config = BrowserConfig(proxy_config={
+ "server": "http://gate.nstproxy.io:24125",
+ "username": "your_username",
+ "password": "your_password",
+ })
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ print("[Auth Proxy] Status:", result.status_code)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/nst_proxy/basic_proxy_example.py b/docs/examples/nst_proxy/basic_proxy_example.py
new file mode 100644
index 00000000..5a79525c
--- /dev/null
+++ b/docs/examples/nst_proxy/basic_proxy_example.py
@@ -0,0 +1,29 @@
+"""
+NSTProxy Integration Examples for crawl4ai
+------------------------------------------
+
+NSTProxy is a premium residential proxy provider.
+👉 Purchase Proxies: https://nstproxy.com
+💰 Use coupon code "crawl4ai" for 10% off your plan.
+
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+
+async def main():
+ # Using HTTP proxy
+ browser_config = BrowserConfig(proxy_config={"server": "http://gate.nstproxy.io:24125"})
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ print("[HTTP Proxy] Status:", result.status_code)
+
+ # Using SOCKS proxy
+ browser_config = BrowserConfig(proxy_config={"server": "socks5://gate.nstproxy.io:24125"})
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ print("[SOCKS5 Proxy] Status:", result.status_code)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/nst_proxy/nstproxy_example.py b/docs/examples/nst_proxy/nstproxy_example.py
new file mode 100644
index 00000000..4e8587b3
--- /dev/null
+++ b/docs/examples/nst_proxy/nstproxy_example.py
@@ -0,0 +1,39 @@
+"""
+NSTProxy Integration Examples for crawl4ai
+------------------------------------------
+
+NSTProxy is a premium residential proxy provider.
+👉 Purchase Proxies: https://nstproxy.com
+💰 Use coupon code "crawl4ai" for 10% off your plan.
+
+"""
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+
+async def main():
+ """
+ Example: Using NSTProxy with AsyncWebCrawler.
+ """
+
+ NST_TOKEN = "YOUR_NST_PROXY_TOKEN" # Get from https://app.nstproxy.com/profile
+ CHANNEL_ID = "YOUR_NST_PROXY_CHANNEL_ID" # Your NSTProxy Channel ID
+
+ browser_config = BrowserConfig()
+ browser_config.set_nstproxy(
+ token=NST_TOKEN,
+ channel_id=CHANNEL_ID,
+ country="ANY", # e.g. "US", "JP", or "ANY"
+ state="", # optional, leave empty if not needed
+ city="", # optional, leave empty if not needed
+ session_duration=0 # Session duration in minutes; 0 = rotate on every request
+ )
+
+ # === Run crawler ===
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ print("[Nstproxy] Status:", result.status_code)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md
index 3864f840..2b155857 100644
--- a/docs/md_v2/advanced/identity-based-crawling.md
+++ b/docs/md_v2/advanced/identity-based-crawling.md
@@ -82,6 +82,42 @@ If you installed Crawl4AI (which installs Playwright under the hood), you alread
---
+### Creating a Profile Using the Crawl4AI CLI (Easiest)
+
+If you prefer a guided, interactive setup, use the built-in CLI to create and manage persistent browser profiles.
+
+1.⠀Launch the profile manager:
+ ```bash
+ crwl profiles
+ ```
+
+2.⠀Choose "Create new profile" and enter a profile name. A Chromium window opens so you can log in to sites and configure settings. When finished, return to the terminal and press `q` to save the profile.
+
+3.⠀Profiles are saved under `~/.crawl4ai/profiles/<profile_name>` (for example: `/home/<you>/.crawl4ai/profiles/test_profile_1`) along with a `storage_state.json` for cookies and session data.
+
+4.⠀Optionally, choose "List profiles" in the CLI to view available profiles and their paths.
+
+5.⠀Use the saved path with `BrowserConfig.user_data_dir`:
+ ```python
+ from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+ profile_path = "/home/<you>/.crawl4ai/profiles/test_profile_1"
+
+ browser_config = BrowserConfig(
+ headless=True,
+ use_managed_browser=True,
+ user_data_dir=profile_path,
+ browser_type="chromium",
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com/private")
+ ```
+
+The CLI also supports listing and deleting profiles, and even testing a crawl directly from the menu.
+
+---
+
## 3. Using Managed Browsers in Crawl4AI
Once you have a data directory with your session data, pass it to **`BrowserConfig`**:
diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md
index d1c868b2..d14e59ff 100644
--- a/docs/md_v2/advanced/proxy-security.md
+++ b/docs/md_v2/advanced/proxy-security.md
@@ -1,98 +1,304 @@
-# Proxy
+# Proxy & Security
+
+This guide covers proxy configuration and security features in Crawl4AI, including SSL certificate analysis and proxy rotation strategies.
+
+## Understanding Proxy Configuration
+
+Crawl4AI recommends configuring proxies per request through `CrawlerRunConfig.proxy_config`. This gives you precise control, enables rotation strategies, and keeps examples simple enough to copy, paste, and run.
## Basic Proxy Setup
-Simple proxy configuration with `BrowserConfig`:
+Configure proxies that apply to each crawl operation:
```python
-from crawl4ai.async_configs import BrowserConfig
-
-# Using HTTP proxy
-browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(url="https://example.com")
-
-# Using SOCKS proxy
-browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(url="https://example.com")
-```
-
-## Authenticated Proxy
-
-Use an authenticated proxy with `BrowserConfig`:
-
-```python
-from crawl4ai.async_configs import BrowserConfig
-
-browser_config = BrowserConfig(proxy_config={
- "server": "http://[host]:[port]",
- "username": "[username]",
- "password": "[password]",
-})
-async with AsyncWebCrawler(config=browser_config) as crawler:
- result = await crawler.arun(url="https://example.com")
-```
-
-
-## Rotating Proxies
-
-Example using a proxy rotation service dynamically:
-
-```python
-import re
-from crawl4ai import (
- AsyncWebCrawler,
- BrowserConfig,
- CrawlerRunConfig,
- CacheMode,
- RoundRobinProxyStrategy,
-)
import asyncio
-from crawl4ai import ProxyConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, ProxyConfig
+
+run_config = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://proxy.example.com:8080"))
+# run_config = CrawlerRunConfig(proxy_config={"server": "http://proxy.example.com:8080"})
+# run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
+
+
async def main():
- # Load proxies and create rotation strategy
+ browser_config = BrowserConfig()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=run_config)
+ print(f"Success: {result.success} -> {result.url}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+!!! note "Why request-level?"
+ `CrawlerRunConfig.proxy_config` keeps each request self-contained, so swapping proxies or rotation strategies is just a matter of building a new run configuration.
+
+## Supported Proxy Formats
+
+The `ProxyConfig.from_string()` method supports multiple formats:
+
+```python
+from crawl4ai import ProxyConfig
+
+# HTTP proxy with authentication
+proxy1 = ProxyConfig.from_string("http://user:pass@192.168.1.1:8080")
+
+# HTTPS proxy
+proxy2 = ProxyConfig.from_string("https://proxy.example.com:8080")
+
+# SOCKS5 proxy
+proxy3 = ProxyConfig.from_string("socks5://proxy.example.com:1080")
+
+# Simple IP:port format
+proxy4 = ProxyConfig.from_string("192.168.1.1:8080")
+
+# IP:port:user:pass format
+proxy5 = ProxyConfig.from_string("192.168.1.1:8080:user:pass")
+```
+
+## Authenticated Proxies
+
+For proxies requiring authentication:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler,BrowserConfig, CrawlerRunConfig, ProxyConfig
+
+run_config = CrawlerRunConfig(
+ proxy_config=ProxyConfig(
+ server="http://proxy.example.com:8080",
+ username="your_username",
+ password="your_password",
+ )
+)
+# Or dictionary style:
+# run_config = CrawlerRunConfig(proxy_config={
+# "server": "http://proxy.example.com:8080",
+# "username": "your_username",
+# "password": "your_password",
+# })
+
+
+async def main():
+ browser_config = BrowserConfig()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=run_config)
+ print(f"Success: {result.success} -> {result.url}")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Environment Variable Configuration
+
+Load proxies from environment variables for easy configuration:
+
+```python
+import os
+from crawl4ai import ProxyConfig, CrawlerRunConfig
+
+# Set environment variable
+os.environ["PROXIES"] = "ip1:port1:user1:pass1,ip2:port2:user2:pass2,ip3:port3"
+
+# Load all proxies
+proxies = ProxyConfig.from_env()
+print(f"Loaded {len(proxies)} proxies")
+
+# Use first proxy
+if proxies:
+ run_config = CrawlerRunConfig(proxy_config=proxies[0])
+```
+
+## Rotating Proxies
+
+Crawl4AI supports automatic proxy rotation to distribute requests across multiple proxy servers. Rotation is applied per request using a rotation strategy on `CrawlerRunConfig`.
+
+### Proxy Rotation (recommended)
+```python
+import asyncio
+import re
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, ProxyConfig
+from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
+
+async def main():
+ # Load proxies from environment
proxies = ProxyConfig.from_env()
- #eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
if not proxies:
- print("No proxies found in environment. Set PROXIES env variable!")
+ print("No proxies found! Set PROXIES environment variable.")
return
+ # Create rotation strategy
proxy_strategy = RoundRobinProxyStrategy(proxies)
- # Create configs
+ # Configure per-request with proxy rotation
browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
- proxy_rotation_strategy=proxy_strategy
+ proxy_rotation_strategy=proxy_strategy,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
- print("\n📈 Initializing crawler with proxy rotation...")
- async with AsyncWebCrawler(config=browser_config) as crawler:
- print("\n🚀 Starting batch crawl with proxy rotation...")
- results = await crawler.arun_many(
- urls=urls,
- config=run_config
- )
- for result in results:
- if result.success:
- ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
- current_proxy = run_config.proxy_config if run_config.proxy_config else None
+ print(f"🚀 Testing {len(proxies)} proxies with rotation...")
+ results = await crawler.arun_many(urls=urls, config=run_config)
- if current_proxy and ip_match:
- print(f"URL {result.url}")
- print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
- verified = ip_match.group(0) == current_proxy.ip
- if verified:
- print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
- else:
- print("❌ Proxy failed or IP mismatch!")
- print("---")
+ for i, result in enumerate(results):
+ if result.success:
+ # Extract IP from response
+ ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
+ if ip_match:
+ detected_ip = ip_match.group(0)
+ proxy_index = i % len(proxies)
+ expected_ip = proxies[proxy_index].ip
-asyncio.run(main())
+ print(f"✅ Request {i+1}: Proxy {proxy_index+1} -> IP {detected_ip}")
+ if detected_ip == expected_ip:
+ print(" 🎯 IP matches proxy configuration")
+ else:
+ print(f" ⚠️ IP mismatch (expected {expected_ip})")
+ else:
+ print(f"❌ Request {i+1}: Could not extract IP from response")
+ else:
+ print(f"❌ Request {i+1}: Failed - {result.error_message}")
+if __name__ == "__main__":
+ asyncio.run(main())
```
+## SSL Certificate Analysis
+
+Combine proxy usage with SSL certificate inspection for enhanced security analysis. SSL certificate fetching is configured per request via `CrawlerRunConfig`.
+
+### Per-Request SSL Certificate Analysis
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+run_config = CrawlerRunConfig(
+ proxy_config={
+ "server": "http://proxy.example.com:8080",
+ "username": "user",
+ "password": "pass",
+ },
+ fetch_ssl_certificate=True, # Enable SSL certificate analysis for this request
+)
+
+
+async def main():
+ browser_config = BrowserConfig()
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=run_config)
+
+ if result.success:
+ print(f"✅ Crawled via proxy: {result.url}")
+
+ # Analyze SSL certificate
+ if result.ssl_certificate:
+ cert = result.ssl_certificate
+ print("🔒 SSL Certificate Info:")
+ print(f" Issuer: {cert.issuer}")
+ print(f" Subject: {cert.subject}")
+ print(f" Valid until: {cert.valid_until}")
+ print(f" Fingerprint: {cert.fingerprint}")
+
+ # Export certificate
+ cert.to_json("certificate.json")
+ print("💾 Certificate exported to certificate.json")
+ else:
+ print("⚠️ No SSL certificate information available")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Security Best Practices
+
+### 1. Proxy Rotation for Anonymity
+```python
+from crawl4ai import CrawlerRunConfig, ProxyConfig
+from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
+
+# Use multiple proxies to avoid IP blocking
+proxies = ProxyConfig.from_env("PROXIES")
+strategy = RoundRobinProxyStrategy(proxies)
+
+# Configure rotation per request (recommended)
+run_config = CrawlerRunConfig(proxy_rotation_strategy=strategy)
+
+# For a fixed proxy across all requests, pass a single proxy via proxy_config instead of a rotation strategy
+static_run_config = CrawlerRunConfig(proxy_config=proxies[0])
+```
+
+### 2. SSL Certificate Verification
+```python
+from crawl4ai import CrawlerRunConfig
+
+# Always verify SSL certificates when possible
+# Per-request (affects specific requests)
+run_config = CrawlerRunConfig(fetch_ssl_certificate=True)
+```
+
+### 3. Environment Variable Security
+```bash
+# Use environment variables for sensitive proxy credentials
+# Avoid hardcoding usernames/passwords in code
+export PROXIES="ip1:port1:user1:pass1,ip2:port2:user2:pass2"
+```
+
+### 4. SOCKS5 for Enhanced Security
+```python
+from crawl4ai import CrawlerRunConfig
+
+# Prefer SOCKS5 proxies for better protocol support
+run_config = CrawlerRunConfig(proxy_config="socks5://proxy.example.com:1080")
+```
+
+## Migration from Deprecated `proxy` Parameter
+
+!!! warning "Deprecation Notice"
+    The legacy `proxy` argument on `BrowserConfig` is deprecated. Configure proxies through `CrawlerRunConfig.proxy_config` so each request fully describes its network settings.
+
+```python
+# Old (deprecated) approach
+# from crawl4ai import BrowserConfig
+# browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
+
+# New (preferred) approach
+from crawl4ai import CrawlerRunConfig
+run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
+```
+
+### Safe Logging of Proxies
+```python
+from crawl4ai import ProxyConfig
+
+def safe_proxy_repr(proxy: ProxyConfig):
+ if getattr(proxy, "username", None):
+ return f"{proxy.server} (auth: ****)"
+ return proxy.server
+```
+
+## Troubleshooting
+
+### Common Issues
+
+- "Proxy connection failed"
+ - Verify the proxy server is reachable from your network.
+ - Double-check authentication credentials.
+ - Ensure the protocol matches (`http`, `https`, or `socks5`).
+
+- "SSL certificate errors"
+ - Some proxies break SSL inspection; switch proxies if you see repeated failures.
+ - Consider temporarily disabling certificate fetching to isolate the issue.
+
+- "Environment variables not loading"
+ - Confirm `PROXIES` (or your custom env var) is set before running the script.
+ - Check formatting: `ip:port:user:pass,ip:port:user:pass`.
+
+- "Proxy rotation not working"
+ - Ensure `ProxyConfig.from_env()` actually loaded entries (`len(proxies) > 0`).
+ - Attach `proxy_rotation_strategy` to `CrawlerRunConfig`.
+ - Validate the proxy definitions you pass into the strategy.
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index 368f53fb..41984ba5 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -21,21 +21,35 @@ browser_cfg = BrowserConfig(
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"`<br/>*(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites, `"firefox"` or `"webkit"` for specialized tests. |
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
+| **`browser_mode`** | `str` (default: `"dedicated"`) | How the browser is initialized: `"dedicated"` (new instance), `"builtin"` (CDP background), `"custom"` (explicit CDP), `"docker"` (container). |
+| **`use_managed_browser`** | `bool` (default: `False`) | Launch browser via CDP for advanced control. Set automatically based on `browser_mode`. |
+| **`cdp_url`** | `str` (default: `None`) | Chrome DevTools Protocol endpoint URL (e.g., `"ws://localhost:9222/devtools/browser/"`). Set automatically based on `browser_mode`. |
+| **`debugging_port`** | `int` (default: `9222`) | Port for browser debugging protocol. |
+| **`host`** | `str` (default: `"localhost"`) | Host for browser connection. |
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
+| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
-| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
+| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
+| **`chrome_channel`** | `str` (default: `"chromium"`) | Chrome channel to launch (e.g., "chrome", "msedge"). Only for `browser_type="chromium"`. Auto-set to empty for Firefox/WebKit. |
+| **`channel`** | `str` (default: `"chromium"`) | Alias for `chrome_channel`. |
+| **`accept_downloads`** | `bool` (default: `False`) | Whether to allow file downloads. Requires `downloads_path` if `True`. |
+| **`downloads_path`** | `str or None` (default: `None`) | Directory to store downloaded files. |
+| **`storage_state`** | `str or dict or None` (default: `None`)| In-memory storage state (cookies, localStorage) to restore browser state. |
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
+| **`sleep_on_close`** | `bool` (default: `False`) | Add a small delay when closing the browser (can help with cleanup issues). |
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
-| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
-| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
+| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom user agent string. |
+| **`user_agent_mode`** | `str` (default: `""`) | Set to `"random"` to randomize user agent from a pool (helps with bot detection). |
+| **`user_agent_generator_config`** | `dict` (default: `{}`) | Configuration dict for user agent generation when `user_agent_mode="random"`. |
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
-| **`use_managed_browser`** | `bool` (default: `False`) | For advanced “managed” interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
+| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |
+| **`enable_stealth`** | `bool` (default: `False`) | Enable playwright-stealth mode to bypass bot detection. Cannot be used with `browser_mode="builtin"`. |
**Tips**:
- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
@@ -70,6 +84,7 @@ We group them by category.
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
+| **`chunking_strategy`** | `ChunkingStrategy` (default: RegexChunking()) | Strategy to chunk content before extraction. Can be customized for different chunking approaches. |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as `content_source` parameter to select the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
@@ -78,32 +93,50 @@ We group them by category.
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
| **`prettiify`** | `bool` (False) | If `True`, beautifies final HTML (slower, purely cosmetic). |
| **`keep_data_attributes`** | `bool` (False) | If `True`, preserve `data-*` attributes in cleaned HTML. |
+| **`keep_attrs`** | `list` (default: []) | List of HTML attributes to keep during processing (e.g., `["id", "class", "data-value"]`). |
| **`remove_forms`** | `bool` (False) | If `True`, remove all `