Release v0.7.7

- Updated version to 0.7.7
- Added comprehensive demo and release notes
- Updated all documentation
ntohidi
2025-11-14 10:23:31 +01:00
parent 2c973b1183
commit 6244f56f36
7 changed files with 1966 additions and 20 deletions

@@ -0,0 +1,628 @@
#!/usr/bin/env python3
"""
Crawl4AI v0.7.7 Release Demo
============================
This demo showcases the major feature in v0.7.7:
**Self-Hosting with Real-time Monitoring Dashboard**
Features Demonstrated:
1. System health monitoring with live metrics
2. Real-time request tracking (active & completed)
3. Browser pool management (permanent/hot/cold pools)
4. Monitor API endpoints for programmatic access
5. WebSocket streaming for real-time updates
6. Control actions (kill browser, cleanup, restart)
7. Production metrics (efficiency, reuse rates, memory)
Prerequisites:
- Crawl4AI Docker container running on localhost:11235
- Python packages: pip install httpx websockets
Usage:
python docs/releases_review/demo_v0.7.7.py
"""
import asyncio

import httpx
# Configuration
CRAWL4AI_BASE_URL = "http://localhost:11235"
MONITOR_DASHBOARD_URL = f"{CRAWL4AI_BASE_URL}/dashboard"
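
# Hedged convenience (an assumption, not part of the original script): when the
# container runs somewhere other than localhost, the address could be taken
# from an environment variable instead of being hardcoded.
# import os
# CRAWL4AI_BASE_URL = os.environ.get("CRAWL4AI_URL", CRAWL4AI_BASE_URL)
# MONITOR_DASHBOARD_URL = f"{CRAWL4AI_BASE_URL}/dashboard"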
def print_section(title: str, description: str = ""):
"""Print a formatted section header"""
print(f"\n{'=' * 70}")
print(f"📊 {title}")
if description:
print(f"{description}")
print(f"{'=' * 70}\n")
def print_subsection(title: str):
"""Print a formatted subsection header"""
print(f"\n{'-' * 70}")
print(f"{title}")
print(f"{'-' * 70}")
async def check_server_health():
"""Check if Crawl4AI server is running"""
try:
async with httpx.AsyncClient(timeout=5.0) as client:
response = await client.get(f"{CRAWL4AI_BASE_URL}/health")
return response.status_code == 200
    except httpx.HTTPError:
return False
async def demo_1_system_health_overview():
"""Demo 1: System Health Overview - Live metrics and pool status"""
print_section(
"Demo 1: System Health Overview",
"Real-time monitoring of system resources and browser pool"
)
async with httpx.AsyncClient(timeout=30.0) as client:
print("🔍 Fetching system health metrics...")
try:
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/health")
health = response.json()
print("\n✅ System Health Report:")
print(f"\n🖥️ Container Metrics:")
print(f" • CPU Usage: {health['container']['cpu_percent']:.1f}%")
print(f" • Memory Usage: {health['container']['memory_percent']:.1f}% "
f"({health['container']['memory_mb']:.0f} MB)")
print(f" • Network RX: {health['container']['network_rx_mb']:.2f} MB")
print(f" • Network TX: {health['container']['network_tx_mb']:.2f} MB")
print(f" • Uptime: {health['container']['uptime_seconds']:.0f}s")
print(f"\n🌐 Browser Pool Status:")
print(f" Permanent Browser:")
print(f" • Active: {health['pool']['permanent']['active']}")
print(f" • Total Requests: {health['pool']['permanent']['total_requests']}")
print(f" Hot Pool (Frequently Used Configs):")
print(f" • Count: {health['pool']['hot']['count']}")
print(f" • Total Requests: {health['pool']['hot']['total_requests']}")
print(f" Cold Pool (On-Demand Configs):")
print(f" • Count: {health['pool']['cold']['count']}")
print(f" • Total Requests: {health['pool']['cold']['total_requests']}")
print(f"\n📈 Overall Statistics:")
print(f" • Total Requests: {health['stats']['total_requests']}")
print(f" • Success Rate: {health['stats']['success_rate_percent']:.1f}%")
print(f" • Avg Latency: {health['stats']['avg_latency_ms']:.0f}ms")
print(f"\n💡 Dashboard URL: {MONITOR_DASHBOARD_URL}")
except Exception as e:
print(f"❌ Error fetching health: {e}")
async def demo_2_request_tracking():
"""Demo 2: Real-time Request Tracking - Generate and monitor requests"""
print_section(
"Demo 2: Real-time Request Tracking",
"Submit crawl jobs and watch them in real-time"
)
async with httpx.AsyncClient(timeout=60.0) as client:
print("🚀 Submitting crawl requests...")
# Submit multiple requests
urls_to_crawl = [
"https://httpbin.org/html",
"https://httpbin.org/json",
"https://example.com"
]
tasks = []
for url in urls_to_crawl:
task = client.post(
f"{CRAWL4AI_BASE_URL}/crawl",
json={"urls": [url], "crawler_config": {}}
)
tasks.append(task)
print(f" • Submitting {len(urls_to_crawl)} requests in parallel...")
results = await asyncio.gather(*tasks, return_exceptions=True)
successful = sum(1 for r in results if not isinstance(r, Exception) and r.status_code == 200)
print(f"{successful}/{len(urls_to_crawl)} requests submitted")
# Check request tracking
print("\n📊 Checking request tracking...")
await asyncio.sleep(2) # Wait for requests to process
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/requests")
requests_data = response.json()
print(f"\n📋 Request Status:")
print(f" • Active Requests: {len(requests_data['active'])}")
print(f" • Completed Requests: {len(requests_data['completed'])}")
if requests_data['completed']:
print(f"\n📝 Recent Completed Requests:")
for req in requests_data['completed'][:3]:
                status_icon = "✅" if req['success'] else "❌"
print(f" {status_icon} {req['endpoint']} - {req['latency_ms']:.0f}ms")
async def demo_3_browser_pool_management():
"""Demo 3: Browser Pool Management - 3-tier architecture in action"""
print_section(
"Demo 3: Browser Pool Management",
"Understanding permanent, hot, and cold browser pools"
)
async with httpx.AsyncClient(timeout=60.0) as client:
print("🌊 Testing browser pool with different configurations...")
# Test 1: Default config (permanent browser)
print("\n🔥 Test 1: Default Config → Permanent Browser")
for i in range(3):
await client.post(
f"{CRAWL4AI_BASE_URL}/crawl",
json={"urls": [f"https://httpbin.org/html?req={i}"], "crawler_config": {}}
)
print(f" • Request {i+1}/3 sent (should use permanent browser)")
await asyncio.sleep(2)
# Test 2: Custom viewport (cold → hot promotion after 3 uses)
print("\n♨️ Test 2: Custom Viewport → Cold Pool (promoting to Hot)")
viewport_config = {"viewport": {"width": 1280, "height": 720}}
for i in range(4):
await client.post(
f"{CRAWL4AI_BASE_URL}/crawl",
json={
"urls": [f"https://httpbin.org/json?viewport={i}"],
"browser_config": viewport_config,
"crawler_config": {}
}
)
print(f" • Request {i+1}/4 sent (cold→hot promotion after 3rd use)")
await asyncio.sleep(2)
# Check browser pool status
print("\n📊 Browser Pool Report:")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/browsers")
browsers = response.json()
print(f"\n🎯 Pool Summary:")
print(f" • Total Browsers: {browsers['summary']['total_count']}")
print(f" • Total Memory: {browsers['summary']['total_memory_mb']} MB")
print(f" • Reuse Rate: {browsers['summary']['reuse_rate_percent']:.1f}%")
print(f"\n📋 Browser Pool Details:")
if browsers['permanent']:
for browser in browsers['permanent']:
print(f" 🔥 Permanent: {browser['browser_id'][:8]}... | "
f"Requests: {browser['request_count']} | "
f"Memory: {browser['memory_mb']:.0f} MB")
if browsers['hot']:
for browser in browsers['hot']:
print(f" ♨️ Hot: {browser['browser_id'][:8]}... | "
f"Requests: {browser['request_count']} | "
f"Memory: {browser['memory_mb']:.0f} MB")
if browsers['cold']:
for browser in browsers['cold']:
print(f" ❄️ Cold: {browser['browser_id'][:8]}... | "
f"Requests: {browser['request_count']} | "
f"Memory: {browser['memory_mb']:.0f} MB")
async def demo_4_monitor_api_endpoints():
"""Demo 4: Monitor API Endpoints - Complete API surface"""
print_section(
"Demo 4: Monitor API Endpoints",
"Programmatic access to all monitoring data"
)
async with httpx.AsyncClient(timeout=30.0) as client:
print("🔌 Testing Monitor API endpoints...")
# Endpoint performance statistics
print_subsection("Endpoint Performance Statistics")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/endpoints/stats")
endpoint_stats = response.json()
print("\n📊 Per-Endpoint Analytics:")
for endpoint, stats in endpoint_stats.items():
print(f" {endpoint}:")
print(f" • Requests: {stats['count']}")
print(f" • Avg Latency: {stats['avg_latency_ms']:.0f}ms")
print(f" • Success Rate: {stats['success_rate_percent']:.1f}%")
# Timeline data for charts
print_subsection("Timeline Data (for Charts)")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/timeline?minutes=5")
timeline = response.json()
print(f"\n📈 Timeline Metrics (last 5 minutes):")
print(f" • Data Points: {len(timeline['memory'])}")
if timeline['memory']:
latest = timeline['memory'][-1]
print(f" • Latest Memory: {latest['value']:.1f}%")
print(f" • Timestamp: {latest['timestamp']}")
# Janitor logs
print_subsection("Janitor Cleanup Events")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/logs/janitor?limit=3")
janitor_logs = response.json()
print(f"\n🧹 Recent Cleanup Activities:")
if janitor_logs:
for log in janitor_logs[:3]:
print(f"{log['timestamp']}: {log['message']}")
else:
print(" (No cleanup events yet - janitor runs periodically)")
# Error logs
print_subsection("Error Monitoring")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/logs/errors?limit=3")
error_logs = response.json()
print(f"\n❌ Recent Errors:")
if error_logs:
for log in error_logs[:3]:
print(f"{log['timestamp']}: {log['error_type']}")
print(f" {log['message'][:100]}...")
else:
print(" ✅ No recent errors!")
async def demo_5_websocket_streaming():
"""Demo 5: WebSocket Streaming - Real-time updates"""
print_section(
"Demo 5: WebSocket Streaming",
"Live monitoring with 2-second update intervals"
)
print("⚡ WebSocket Streaming Demo")
print("\n💡 The monitoring dashboard uses WebSocket for real-time updates")
print(f" • Connection: ws://localhost:11235/monitor/ws")
print(f" • Update Interval: 2 seconds")
print(f" • Data: Health, requests, browsers, memory, errors")
print("\n📝 Sample WebSocket Integration Code:")
print("""
    import asyncio
    import json
    import websockets

    async def monitor_realtime():
        uri = "ws://localhost:11235/monitor/ws"
        async with websockets.connect(uri) as websocket:
            while True:
                data = await websocket.recv()
                update = json.loads(data)
                print(f"Memory: {update['health']['container']['memory_percent']:.1f}%")
                print(f"Active Requests: {len(update['requests']['active'])}")
                print(f"Browser Pool: {update['health']['pool']['permanent']['active']}")

    asyncio.run(monitor_realtime())
""")
print("\n🌐 Open the dashboard to see WebSocket in action:")
print(f" {MONITOR_DASHBOARD_URL}")
async def demo_6_control_actions():
"""Demo 6: Control Actions - Manual browser management"""
print_section(
"Demo 6: Control Actions",
"Manual control over browser pool and cleanup"
)
async with httpx.AsyncClient(timeout=30.0) as client:
print("🎮 Testing control actions...")
# Force cleanup
print_subsection("Force Immediate Cleanup")
print("🧹 Triggering manual cleanup...")
try:
response = await client.post(f"{CRAWL4AI_BASE_URL}/monitor/actions/cleanup")
if response.status_code == 200:
result = response.json()
print(f" ✅ Cleanup completed")
print(f" • Browsers cleaned: {result.get('cleaned_count', 0)}")
print(f" • Memory freed: {result.get('memory_freed_mb', 0):.1f} MB")
else:
print(f" ⚠️ Response: {response.status_code}")
except Exception as e:
print(f" Cleanup action: {e}")
# Get browser list for potential kill/restart
print_subsection("Browser Management")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/browsers")
browsers = response.json()
cold_browsers = browsers.get('cold', [])
if cold_browsers:
browser_id = cold_browsers[0]['browser_id']
print(f"\n🎯 Example: Kill specific browser")
print(f" POST /monitor/actions/kill_browser")
print(f" JSON: {{'browser_id': '{browser_id[:16]}...'}}")
print(f" → Kills the browser and frees resources")
print(f"\n🔄 Example: Restart browser")
print(f" POST /monitor/actions/restart_browser")
print(f" JSON: {{'browser_id': 'browser_id_here'}}")
print(f" → Restart a specific browser instance")
# Reset statistics
print_subsection("Reset Statistics")
print("📊 Statistics can be reset for fresh monitoring:")
print(f" POST /monitor/stats/reset")
print(f" → Clears all accumulated statistics")
async def demo_7_production_metrics():
"""Demo 7: Production Metrics - Key indicators for operations"""
print_section(
"Demo 7: Production Metrics",
"Critical metrics for production monitoring"
)
async with httpx.AsyncClient(timeout=30.0) as client:
print("📊 Key Production Metrics:")
# Overall health
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/health")
health = response.json()
# Browser efficiency
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/browsers")
browsers = response.json()
print("\n🎯 Critical Metrics to Track:")
print(f"\n1⃣ Memory Usage Trends")
print(f" • Current: {health['container']['memory_percent']:.1f}%")
print(f" • Alert if: >80%")
print(f" • Action: Trigger cleanup or scale")
print(f"\n2⃣ Request Success Rate")
print(f" • Current: {health['stats']['success_rate_percent']:.1f}%")
print(f" • Target: >95%")
print(f" • Alert if: <90%")
print(f"\n3⃣ Average Latency")
print(f" • Current: {health['stats']['avg_latency_ms']:.0f}ms")
print(f" • Target: <2000ms")
print(f" • Alert if: >5000ms")
print(f"\n4⃣ Browser Pool Efficiency")
print(f" • Reuse Rate: {browsers['summary']['reuse_rate_percent']:.1f}%")
print(f" • Target: >80%")
print(f" • Indicates: Effective browser pooling")
print(f"\n5⃣ Total Browsers")
print(f" • Current: {browsers['summary']['total_count']}")
print(f" • Alert if: >20 (possible leak)")
print(f" • Check: Janitor is running correctly")
print(f"\n6⃣ Error Frequency")
response = await client.get(f"{CRAWL4AI_BASE_URL}/monitor/logs/errors?limit=10")
errors = response.json()
print(f" • Recent Errors: {len(errors)}")
print(f" • Alert if: >10 in last hour")
print(f" • Action: Review error patterns")
print("\n💡 Integration Examples:")
print(" • Prometheus: Scrape /monitor/health")
print(" • Alerting: Monitor memory, success rate, latency")
print(" • Dashboards: WebSocket streaming to custom UI")
print(" • Log Aggregation: Collect /monitor/logs/* endpoints")
async def demo_8_self_hosting_value():
"""Demo 8: Self-Hosting Value Proposition"""
print_section(
"Demo 8: Why Self-Host Crawl4AI?",
"The value proposition of owning your infrastructure"
)
print("🎯 Self-Hosting Benefits:\n")
print("🔒 Data Privacy & Security")
print(" • Your data never leaves your infrastructure")
print(" • No third-party access to crawled content")
print(" • Keep sensitive workflows behind your firewall")
print("\n💰 Cost Control")
print(" • No per-request pricing or rate limits")
print(" • Predictable infrastructure costs")
print(" • Scale based on your actual needs")
print("\n🎯 Full Customization")
print(" • Complete control over browser configs")
print(" • Custom hooks and strategies")
print(" • Tailored monitoring and alerting")
print("\n📊 Complete Transparency")
print(" • Real-time monitoring dashboard")
print(" • Full visibility into system performance")
print(" • Detailed request and error tracking")
print("\n⚡ Performance & Flexibility")
print(" • Direct access, no network overhead")
print(" • Integrate with existing infrastructure")
print(" • Custom resource allocation")
print("\n🛡️ Enterprise-Grade Operations")
print(" • Prometheus integration ready")
print(" • WebSocket for real-time dashboards")
print(" • Full API for automation")
print(" • Manual controls for troubleshooting")
print(f"\n🌐 Get Started:")
print(f" docker pull unclecode/crawl4ai:0.7.7")
print(f" docker run -d -p 11235:11235 --shm-size=1g unclecode/crawl4ai:0.7.7")
print(f" # Visit: {MONITOR_DASHBOARD_URL}")
def print_summary():
"""Print comprehensive demo summary"""
print("\n" + "=" * 70)
print("📊 DEMO SUMMARY - Crawl4AI v0.7.7")
print("=" * 70)
print("\n✨ Features Demonstrated:")
print("=" * 70)
print("✅ System Health Overview")
print(" → Real-time CPU, memory, network, and uptime monitoring")
print("\n✅ Request Tracking")
print(" → Active and completed request monitoring with full details")
print("\n✅ Browser Pool Management")
print(" → 3-tier architecture: Permanent, Hot, and Cold pools")
print(" → Automatic promotion and cleanup")
print("\n✅ Monitor API Endpoints")
print(" → Complete REST API for programmatic access")
print(" → Health, requests, browsers, timeline, logs, errors")
print("\n✅ WebSocket Streaming")
print(" → Real-time updates every 2 seconds")
print(" → Build custom dashboards with live data")
print("\n✅ Control Actions")
print(" → Manual browser management (kill, restart)")
print(" → Force cleanup and statistics reset")
print("\n✅ Production Metrics")
print(" → 6 critical metrics for operational excellence")
print(" → Prometheus integration patterns")
print("\n✅ Self-Hosting Value")
print(" → Data privacy, cost control, full customization")
print(" → Enterprise-grade transparency and control")
print("\n" + "=" * 70)
print("🎯 What's New in v0.7.7?")
print("=" * 70)
print("• 📊 Complete Real-time Monitoring System")
print("• 🌐 Interactive Web Dashboard (/dashboard)")
print("• 🔌 Comprehensive Monitor API")
print("• ⚡ WebSocket Streaming (2-second updates)")
print("• 🎮 Manual Control Actions")
print("• 📈 Production Integration Examples")
print("• 🏭 Prometheus, Alerting, Log Aggregation")
print("• 🔥 Smart Browser Pool (Permanent/Hot/Cold)")
print("• 🧹 Automatic Janitor Cleanup")
print("• 📋 Full Request & Error Tracking")
print("\n" + "=" * 70)
print("💡 Why This Matters")
print("=" * 70)
print("Before v0.7.7: Docker was just a containerized crawler")
print("After v0.7.7: Complete self-hosting platform with enterprise monitoring")
print("\nYou now have:")
print(" • Full visibility into what's happening inside")
print(" • Real-time operational dashboards")
print(" • Complete control over browser resources")
print(" • Production-ready observability")
print(" • Zero external dependencies")
print("\n" + "=" * 70)
print("📚 Next Steps")
print("=" * 70)
print(f"1. Open the dashboard: {MONITOR_DASHBOARD_URL}")
print("2. Read the docs: https://docs.crawl4ai.com/basic/self-hosting/")
print("3. Try the Monitor API endpoints yourself")
print("4. Set up Prometheus integration for production")
print("5. Build custom dashboards with WebSocket streaming")
print("\n" + "=" * 70)
print("🔗 Resources")
print("=" * 70)
print(f"• Dashboard: {MONITOR_DASHBOARD_URL}")
print(f"• Health API: {CRAWL4AI_BASE_URL}/monitor/health")
print(f"• Documentation: https://docs.crawl4ai.com/")
print(f"• GitHub: https://github.com/unclecode/crawl4ai")
print("\n" + "=" * 70)
print("🎉 You're now in control of your web crawling destiny!")
print("=" * 70)
async def main():
"""Run all demos"""
print("\n" + "=" * 70)
print("🚀 Crawl4AI v0.7.7 Release Demo")
print("=" * 70)
print("Feature: Self-Hosting with Real-time Monitoring Dashboard")
print("=" * 70)
# Check if server is running
print("\n🔍 Checking Crawl4AI server...")
server_running = await check_server_health()
if not server_running:
print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
print("\nPlease start the Docker container:")
print(" docker pull unclecode/crawl4ai:0.7.7")
print(" docker run -d -p 11235:11235 --shm-size=1g unclecode/crawl4ai:0.7.7")
print("\nThen re-run this demo.")
return
print(f"✅ Crawl4AI server is running!")
print(f"📊 Dashboard available at: {MONITOR_DASHBOARD_URL}")
# Run all demos
demos = [
demo_1_system_health_overview,
demo_2_request_tracking,
demo_3_browser_pool_management,
demo_4_monitor_api_endpoints,
demo_5_websocket_streaming,
demo_6_control_actions,
demo_7_production_metrics,
demo_8_self_hosting_value,
]
for i, demo_func in enumerate(demos, 1):
try:
await demo_func()
if i < len(demos):
await asyncio.sleep(2) # Brief pause between demos
except KeyboardInterrupt:
print(f"\n\n⚠️ Demo interrupted by user")
return
except Exception as e:
print(f"\n❌ Demo {i} error: {e}")
print("Continuing to next demo...\n")
continue
# Print comprehensive summary
print_summary()
print("\n" + "=" * 70)
print("✅ Demo completed!")
print("=" * 70)
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
print("\n\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.7!")
except Exception as e:
print(f"\n\n❌ Demo failed: {e}")
print("Make sure the Docker container is running:")
print(" docker run -d -p 11235:11235 --shm-size=1g unclecode/crawl4ai:0.7.7")