Files
crawl4ai/docs/releases_review/v0.7.5_docker_hooks_demo.py
2025-10-14 13:49:57 +08:00

656 lines
23 KiB
Python

#!/usr/bin/env python3
"""
🚀 Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration
================================================================
This file demonstrates the NEW Docker Hooks System introduced in v0.7.5.
The Docker Hooks System is a completely NEW feature that provides pipeline
customization through user-provided Python functions. It offers three approaches:
1. String-based hooks for REST API
2. hooks_to_string() utility to convert functions
3. Docker Client with automatic conversion (most convenient)
All three approaches are part of this NEW v0.7.5 feature!
Perfect for video recording and demonstration purposes.
Requirements:
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5
"""
import asyncio
import requests
import json
import time
from typing import Dict, Any
# Import Crawl4AI components
from crawl4ai import hooks_to_string
from crawl4ai.docker_client import Crawl4aiDockerClient
# Configuration
DOCKER_URL = "http://localhost:11235"
# DOCKER_URL = "http://localhost:11234"
TEST_URLS = [
# "https://httpbin.org/html",
"https://www.kidocode.com",
"https://quotes.toscrape.com",
]
def print_section(title: str, description: str = ""):
"""Print a formatted section header"""
print("\n" + "=" * 70)
print(f" {title}")
if description:
print(f" {description}")
print("=" * 70 + "\n")
def check_docker_service() -> bool:
"""Check if Docker service is running"""
try:
response = requests.get(f"{DOCKER_URL}/health", timeout=3)
return response.status_code == 200
except:
return False
# ============================================================================
# REUSABLE HOOK LIBRARY (NEW in v0.7.5)
# ============================================================================
async def performance_optimization_hook(page, context, **kwargs):
"""
Performance Hook: Block unnecessary resources to speed up crawling
"""
print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")
# Block images
await context.route(
"**/*.{png,jpg,jpeg,gif,webp,svg,ico}",
lambda route: route.abort()
)
# Block ads and analytics
await context.route("**/analytics/*", lambda route: route.abort())
await context.route("**/ads/*", lambda route: route.abort())
await context.route("**/google-analytics.com/*", lambda route: route.abort())
print(" [Hook] ✓ Performance optimization applied")
return page
async def viewport_setup_hook(page, context, **kwargs):
"""
Viewport Hook: Set consistent viewport size for rendering
"""
print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
await page.set_viewport_size({"width": 1920, "height": 1080})
print(" [Hook] ✓ Viewport configured")
return page
async def authentication_headers_hook(page, context, url, **kwargs):
"""
Headers Hook: Add custom authentication and tracking headers
"""
print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...")
await page.set_extra_http_headers({
'X-Crawl4AI-Version': '0.7.5',
'X-Custom-Hook': 'function-based-demo',
'Accept-Language': 'en-US,en;q=0.9',
'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)'
})
print(" [Hook] ✓ Custom headers added")
return page
async def lazy_loading_handler_hook(page, context, **kwargs):
"""
Content Hook: Handle lazy-loaded content by scrolling
"""
print(" [Hook] 📜 Scrolling to load lazy content...")
# Scroll to bottom
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1000)
# Scroll to middle
await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)")
await page.wait_for_timeout(500)
# Scroll back to top
await page.evaluate("window.scrollTo(0, 0)")
await page.wait_for_timeout(500)
print(" [Hook] ✓ Lazy content loaded")
return page
async def page_analytics_hook(page, context, **kwargs):
"""
Analytics Hook: Log page metrics before extraction
"""
print(" [Hook] 📊 Collecting page analytics...")
metrics = await page.evaluate('''
() => ({
title: document.title,
images: document.images.length,
links: document.links.length,
scripts: document.scripts.length,
headings: document.querySelectorAll('h1, h2, h3').length,
paragraphs: document.querySelectorAll('p').length
})
''')
print(f" [Hook] 📈 Page: {metrics['title'][:50]}...")
print(f" Links: {metrics['links']}, Images: {metrics['images']}, "
f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}")
return page
# ============================================================================
# DEMO 1: String-Based Hooks (NEW Docker Hooks System)
# ============================================================================
def demo_1_string_based_hooks():
"""
Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System)
"""
print_section(
"DEMO 1: String-Based Hooks (REST API)",
"Part of the NEW Docker Hooks System - hooks as strings"
)
# Define hooks as strings
hooks_config = {
"on_page_context_created": """
async def hook(page, context, **kwargs):
print(" [String Hook] Setting up page context...")
# Block images for performance
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
await page.set_viewport_size({"width": 1920, "height": 1080})
return page
""",
"before_goto": """
async def hook(page, context, url, **kwargs):
print(f" [String Hook] Navigating to {url[:50]}...")
await page.set_extra_http_headers({
'X-Crawl4AI': 'string-based-hooks',
'X-Demo': 'v0.7.5'
})
return page
""",
"before_retrieve_html": """
async def hook(page, context, **kwargs):
print(" [String Hook] Scrolling page...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1000)
return page
"""
}
# Prepare request payload
payload = {
"urls": [TEST_URLS[0]],
"hooks": {
"code": hooks_config,
"timeout": 30
},
"crawler_config": {
"cache_mode": "bypass"
}
}
print(f"🎯 Target URL: {TEST_URLS[0]}")
print(f"🔧 Configured {len(hooks_config)} string-based hooks")
print(f"📡 Sending request to Docker API...\n")
try:
start_time = time.time()
response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
execution_time = time.time() - start_time
if response.status_code == 200:
result = response.json()
print(f"\n✅ Request successful! (took {execution_time:.2f}s)")
# Display results
if result.get('results') and result['results'][0].get('success'):
crawl_result = result['results'][0]
html_length = len(crawl_result.get('html', ''))
markdown_length = len(crawl_result.get('markdown', ''))
print(f"\n📊 Results:")
print(f" • HTML length: {html_length:,} characters")
print(f" • Markdown length: {markdown_length:,} characters")
print(f" • URL: {crawl_result.get('url')}")
# Check hooks execution
if 'hooks' in result:
hooks_info = result['hooks']
print(f"\n🎣 Hooks Execution:")
print(f" • Status: {hooks_info['status']['status']}")
print(f" • Attached hooks: {len(hooks_info['status']['attached_hooks'])}")
if 'summary' in hooks_info:
summary = hooks_info['summary']
print(f" • Total executions: {summary['total_executions']}")
print(f" • Successful: {summary['successful']}")
print(f" • Success rate: {summary['success_rate']:.1f}%")
else:
print(f"⚠️ Crawl completed but no results")
else:
print(f"❌ Request failed with status {response.status_code}")
print(f" Error: {response.text[:200]}")
except requests.exceptions.Timeout:
print("⏰ Request timed out after 60 seconds")
except Exception as e:
print(f"❌ Error: {str(e)}")
print("\n" + "" * 70)
print("✓ String-based hooks demo complete\n")
# ============================================================================
# DEMO 2: Function-Based Hooks with hooks_to_string() Utility
# ============================================================================
def demo_2_hooks_to_string_utility():
"""
Demonstrate the new hooks_to_string() utility for converting functions
"""
print_section(
"DEMO 2: hooks_to_string() Utility (NEW! ✨)",
"Convert Python functions to strings for REST API"
)
print("📦 Creating hook functions...")
print(" • performance_optimization_hook")
print(" • viewport_setup_hook")
print(" • authentication_headers_hook")
print(" • lazy_loading_handler_hook")
# Convert function objects to strings using the NEW utility
print("\n🔄 Converting functions to strings with hooks_to_string()...")
hooks_dict = {
"on_page_context_created": performance_optimization_hook,
"before_goto": authentication_headers_hook,
"before_retrieve_html": lazy_loading_handler_hook,
}
hooks_as_strings = hooks_to_string(hooks_dict)
print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")
# Show a preview
print("\n📝 Sample converted hook (first 250 characters):")
print("" * 70)
sample_hook = list(hooks_as_strings.values())[0]
print(sample_hook[:250] + "...")
print("" * 70)
# Use the converted hooks with REST API
print("\n📡 Using converted hooks with REST API...")
payload = {
"urls": [TEST_URLS[0]],
"hooks": {
"code": hooks_as_strings,
"timeout": 30
}
}
try:
start_time = time.time()
response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
execution_time = time.time() - start_time
if response.status_code == 200:
result = response.json()
print(f"\n✅ Request successful! (took {execution_time:.2f}s)")
if result.get('results') and result['results'][0].get('success'):
crawl_result = result['results'][0]
print(f" • HTML length: {len(crawl_result.get('html', '')):,} characters")
print(f" • Hooks executed successfully!")
else:
print(f"❌ Request failed: {response.status_code}")
except Exception as e:
print(f"❌ Error: {str(e)}")
print("\n💡 Benefits of hooks_to_string():")
print(" ✓ Write hooks as regular Python functions")
print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
print(" ✓ Type checking and linting")
print(" ✓ Easy to test and debug")
print(" ✓ Reusable across projects")
print(" ✓ Works with any REST API client")
print("\n" + "" * 70)
print("✓ hooks_to_string() utility demo complete\n")
# ============================================================================
# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟)
# ============================================================================
async def demo_3_docker_client_auto_conversion():
"""
Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)
"""
print_section(
"DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)",
"Pass function objects directly - conversion happens automatically!"
)
print("🐳 Initializing Crawl4AI Docker Client...")
client = Crawl4aiDockerClient(base_url=DOCKER_URL)
print("✅ Client ready!\n")
# Use our reusable hook library - just pass the function objects!
print("📚 Using reusable hook library:")
print(" • performance_optimization_hook")
print(" • viewport_setup_hook")
print(" • authentication_headers_hook")
print(" • lazy_loading_handler_hook")
print(" • page_analytics_hook")
print("\n🎯 Target URL: " + TEST_URLS[1])
print("🚀 Starting crawl with automatic hook conversion...\n")
try:
start_time = time.time()
# Pass function objects directly - NO manual conversion needed! ✨
results = await client.crawl(
urls=[TEST_URLS[0]],
hooks={
"on_page_context_created": performance_optimization_hook,
"before_goto": authentication_headers_hook,
"before_retrieve_html": lazy_loading_handler_hook,
"before_return_html": page_analytics_hook,
},
hooks_timeout=30
)
execution_time = time.time() - start_time
print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")
# Display results
if results and results.success:
result = results
print(f"📊 Results:")
print(f" • URL: {result.url}")
print(f" • Success: {result.success}")
print(f" • HTML length: {len(result.html):,} characters")
print(f" • Markdown length: {len(result.markdown):,} characters")
# Show metadata
if result.metadata:
print(f"\n📋 Metadata:")
print(f" • Title: {result.metadata.get('title', 'N/A')}")
print(f" • Description: {result.metadata.get('description', 'N/A')}")
# Show links
if result.links:
internal_count = len(result.links.get('internal', []))
external_count = len(result.links.get('external', []))
print(f"\n🔗 Links Found:")
print(f" • Internal: {internal_count}")
print(f" • External: {external_count}")
else:
print(f"⚠️ Crawl completed but no successful results")
if results:
print(f" Error: {results.error_message}")
except Exception as e:
print(f"❌ Error: {str(e)}")
import traceback
traceback.print_exc()
print("\n🌟 Why Docker Client is RECOMMENDED:")
print(" ✓ Automatic function-to-string conversion")
print(" ✓ No manual hooks_to_string() calls needed")
print(" ✓ Cleaner, more Pythonic code")
print(" ✓ Full type hints and IDE support")
print(" ✓ Built-in error handling")
print(" ✓ Async/await support")
print("\n" + "" * 70)
print("✓ Docker Client auto-conversion demo complete\n")
# ============================================================================
# DEMO 4: Advanced Use Case - Complete Hook Pipeline
# ============================================================================
async def demo_4_complete_hook_pipeline():
"""
Demonstrate a complete hook pipeline using all 8 hook points
"""
print_section(
"DEMO 4: Complete Hook Pipeline",
"Using all 8 available hook points for comprehensive control"
)
# Define all 8 hooks
async def on_browser_created_hook(browser, **kwargs):
"""Hook 1: Called after browser is created"""
print(" [Pipeline] 1/8 Browser created")
return browser
async def on_page_context_created_hook(page, context, **kwargs):
"""Hook 2: Called after page context is created"""
print(" [Pipeline] 2/8 Page context created - setting up...")
await page.set_viewport_size({"width": 1920, "height": 1080})
return page
async def on_user_agent_updated_hook(page, context, user_agent, **kwargs):
"""Hook 3: Called when user agent is updated"""
print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...")
return page
async def before_goto_hook(page, context, url, **kwargs):
"""Hook 4: Called before navigating to URL"""
print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...")
return page
async def after_goto_hook(page, context, url, response, **kwargs):
"""Hook 5: Called after navigation completes"""
print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}")
await page.wait_for_timeout(1000)
return page
async def on_execution_started_hook(page, context, **kwargs):
"""Hook 6: Called when JavaScript execution starts"""
print(" [Pipeline] 6/8 JavaScript execution started")
return page
async def before_retrieve_html_hook(page, context, **kwargs):
"""Hook 7: Called before retrieving HTML"""
print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...")
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
return page
async def before_return_html_hook(page, context, html, **kwargs):
"""Hook 8: Called before returning HTML"""
print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars")
return page
print("🎯 Target URL: " + TEST_URLS[0])
print("🔧 Configured ALL 8 hook points for complete pipeline control\n")
client = Crawl4aiDockerClient(base_url=DOCKER_URL)
try:
print("🚀 Starting complete pipeline crawl...\n")
start_time = time.time()
results = await client.crawl(
urls=[TEST_URLS[0]],
hooks={
"on_browser_created": on_browser_created_hook,
"on_page_context_created": on_page_context_created_hook,
"on_user_agent_updated": on_user_agent_updated_hook,
"before_goto": before_goto_hook,
"after_goto": after_goto_hook,
"on_execution_started": on_execution_started_hook,
"before_retrieve_html": before_retrieve_html_hook,
"before_return_html": before_return_html_hook,
},
hooks_timeout=45
)
execution_time = time.time() - start_time
if results and results.success:
print(f"\n✅ Complete pipeline executed successfully! (took {execution_time:.2f}s)")
print(f" • All 8 hooks executed in sequence")
print(f" • HTML length: {len(results.html):,} characters")
else:
print(f"⚠️ Pipeline completed with warnings")
except Exception as e:
print(f"❌ Error: {str(e)}")
print("\n📚 Available Hook Points:")
print(" 1. on_browser_created - Browser initialization")
print(" 2. on_page_context_created - Page context setup")
print(" 3. on_user_agent_updated - User agent configuration")
print(" 4. before_goto - Pre-navigation setup")
print(" 5. after_goto - Post-navigation processing")
print(" 6. on_execution_started - JavaScript execution start")
print(" 7. before_retrieve_html - Pre-extraction processing")
print(" 8. before_return_html - Final HTML processing")
print("\n" + "" * 70)
print("✓ Complete hook pipeline demo complete\n")
# ============================================================================
# MAIN EXECUTION
# ============================================================================
async def main():
"""
Run all demonstrations
"""
print("\n" + "=" * 70)
print(" 🚀 Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration")
print("=" * 70)
# Check Docker service
print("\n🔍 Checking Docker service status...")
if not check_docker_service():
print("❌ Docker service is not running!")
print("\n📋 To start the Docker service:")
print(" docker run -p 11235:11235 unclecode/crawl4ai:latest")
print("\nPlease start the service and run this demo again.")
return
print("✅ Docker service is running!\n")
# Run all demos
demos = [
("String-Based Hooks (REST API)", demo_1_string_based_hooks, False),
("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False),
("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True),
# ("Complete Hook Pipeline", demo_4_complete_hook_pipeline, True),
]
for i, (name, demo_func, is_async) in enumerate(demos, 1):
print(f"\n{'🔷' * 35}")
print(f"Starting Demo {i}/{len(demos)}: {name}")
print(f"{'🔷' * 35}\n")
try:
if is_async:
await demo_func()
else:
demo_func()
print(f"✅ Demo {i} completed successfully!")
# Pause between demos (except the last one)
if i < len(demos):
print("\n⏸️ Press Enter to continue to next demo...")
# input()
except KeyboardInterrupt:
print(f"\n⏹️ Demo interrupted by user")
break
except Exception as e:
print(f"\n❌ Demo {i} failed: {str(e)}")
import traceback
traceback.print_exc()
print("\nContinuing to next demo...\n")
continue
# Final summary
print("\n" + "=" * 70)
print(" 🎉 All Demonstrations Complete!")
print("=" * 70)
print("\n📊 Summary of v0.7.5 Docker Hooks System:")
print("\n🆕 COMPLETELY NEW FEATURE in v0.7.5:")
print(" The Docker Hooks System lets you customize the crawling pipeline")
print(" with user-provided Python functions at 8 strategic points.")
print("\n✨ Three Ways to Use Docker Hooks (All NEW!):")
print(" 1. String-based - Write hooks as strings for REST API")
print(" 2. hooks_to_string() - Convert Python functions to strings")
print(" 3. Docker Client - Automatic conversion (RECOMMENDED)")
print("\n💡 Key Benefits:")
print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
print(" ✓ Type checking and linting")
print(" ✓ Easy to test and debug")
print(" ✓ Reusable across projects")
print(" ✓ Complete pipeline control")
print("\n🎯 8 Hook Points Available:")
print(" • on_browser_created, on_page_context_created")
print(" • on_user_agent_updated, before_goto, after_goto")
print(" • on_execution_started, before_retrieve_html, before_return_html")
print("\n📚 Resources:")
print(" • Docs: https://docs.crawl4ai.com")
print(" • GitHub: https://github.com/unclecode/crawl4ai")
print(" • Discord: https://discord.gg/jP8KfhDhyN")
print("\n" + "=" * 70)
print(" Happy Crawling with v0.7.5! 🕷️")
print("=" * 70 + "\n")
if __name__ == "__main__":
print("\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...")
print("Press Ctrl+C anytime to exit\n")
try:
asyncio.run(main())
except KeyboardInterrupt:
print("\n\n👋 Demo stopped by user. Thanks for exploring Crawl4AI v0.7.5!")
except Exception as e:
print(f"\n\n❌ Demo error: {str(e)}")
import traceback
traceback.print_exc()