#!/usr/bin/env python3 """ πŸš€ Crawl4AI Docker Hooks System - Complete Examples ==================================================== This file demonstrates the Docker Hooks System with three different approaches: 1. String-based hooks for REST API 2. hooks_to_string() utility to convert functions 3. Docker Client with automatic conversion (most convenient) Requirements: - Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest - crawl4ai installed: pip install crawl4ai """ import asyncio import requests import json import time from typing import Dict, Any # Import Crawl4AI components from crawl4ai import hooks_to_string from crawl4ai.docker_client import Crawl4aiDockerClient # Configuration DOCKER_URL = "http://localhost:11235" TEST_URLS = [ "https://www.kidocode.com", "https://quotes.toscrape.com", "https://httpbin.org/html", ] def print_section(title: str, description: str = ""): """Print a formatted section header""" print("\n" + "=" * 70) print(f" {title}") if description: print(f" {description}") print("=" * 70 + "\n") def check_docker_service() -> bool: """Check if Docker service is running""" try: response = requests.get(f"{DOCKER_URL}/health", timeout=3) return response.status_code == 200 except: return False # ============================================================================ # REUSABLE HOOK LIBRARY # ============================================================================ async def performance_optimization_hook(page, context, **kwargs): """ Performance Hook: Block unnecessary resources to speed up crawling """ print(" [Hook] πŸš€ Optimizing performance - blocking images and ads...") # Block images await context.route( "**/*.{png,jpg,jpeg,gif,webp,svg,ico}", lambda route: route.abort() ) # Block ads and analytics await context.route("**/analytics/*", lambda route: route.abort()) await context.route("**/ads/*", lambda route: route.abort()) await context.route("**/google-analytics.com/*", lambda route: route.abort()) print(" [Hook] βœ“ Performance optimization applied") return page async def viewport_setup_hook(page, context, **kwargs): """ Viewport Hook: Set consistent viewport size for rendering """ print(" [Hook] πŸ–₯️ Setting viewport to 1920x1080...") await page.set_viewport_size({"width": 1920, "height": 1080}) print(" [Hook] βœ“ Viewport configured") return page async def authentication_headers_hook(page, context, url, **kwargs): """ Headers Hook: Add custom authentication and tracking headers """ print(f" [Hook] πŸ” Adding custom headers for {url[:50]}...") await page.set_extra_http_headers({ 'X-Crawl4AI': 'docker-hooks', 'X-Custom-Hook': 'function-based', 'Accept-Language': 'en-US,en;q=0.9', }) print(" [Hook] βœ“ Custom headers added") return page async def lazy_loading_handler_hook(page, context, **kwargs): """ Content Hook: Handle lazy-loaded content by scrolling """ print(" [Hook] πŸ“œ Scrolling to load lazy content...") # Scroll to bottom await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) # Scroll to middle await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") await page.wait_for_timeout(500) # Scroll back to top await page.evaluate("window.scrollTo(0, 0)") await page.wait_for_timeout(500) print(" [Hook] βœ“ Lazy content loaded") return page async def page_analytics_hook(page, context, **kwargs): """ Analytics Hook: Log page metrics before extraction """ print(" [Hook] πŸ“Š Collecting page analytics...") metrics = await page.evaluate(''' () => ({ title: document.title, images: document.images.length, links: document.links.length, scripts: document.scripts.length, headings: document.querySelectorAll('h1, h2, h3').length, paragraphs: document.querySelectorAll('p').length }) ''') print(f" [Hook] πŸ“ˆ Page: {metrics['title'][:50]}...") print(f" Links: {metrics['links']}, Images: {metrics['images']}, " f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}") return page # ============================================================================ # APPROACH 1: String-Based Hooks (REST API) # ============================================================================ def example_1_string_based_hooks(): """ Demonstrate string-based hooks with REST API Use this when working with REST API directly or non-Python clients """ print_section( "APPROACH 1: String-Based Hooks (REST API)", "Define hooks as strings for REST API requests" ) # Define hooks as strings hooks_config = { "on_page_context_created": """ async def hook(page, context, **kwargs): print(" [String Hook] Setting up page context...") # Block images for performance await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) await page.set_viewport_size({"width": 1920, "height": 1080}) return page """, "before_goto": """ async def hook(page, context, url, **kwargs): print(f" [String Hook] Navigating to {url[:50]}...") await page.set_extra_http_headers({ 'X-Crawl4AI': 'string-based-hooks', }) return page """, "before_retrieve_html": """ async def hook(page, context, **kwargs): print(" [String Hook] Scrolling page...") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) return page """ } # Prepare request payload payload = { "urls": [TEST_URLS[2]], # httpbin.org "hooks": { "code": hooks_config, "timeout": 30 }, "crawler_config": { "cache_mode": "bypass" } } print(f"🎯 Target URL: {TEST_URLS[2]}") print(f"πŸ”§ Configured {len(hooks_config)} string-based hooks") print(f"πŸ“‘ Sending request to Docker API...\n") try: start_time = time.time() response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) execution_time = time.time() - start_time if response.status_code == 200: result = response.json() print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") # Display results if result.get('results') and result['results'][0].get('success'): crawl_result = result['results'][0] html_length = len(crawl_result.get('html', '')) markdown_length = len(crawl_result.get('markdown', '')) print(f"\nπŸ“Š Results:") print(f" β€’ HTML length: {html_length:,} characters") print(f" β€’ Markdown length: {markdown_length:,} characters") print(f" β€’ URL: {crawl_result.get('url')}") # Check hooks execution if 'hooks' in result: hooks_info = result['hooks'] print(f"\n🎣 Hooks Execution:") print(f" β€’ Status: {hooks_info['status']['status']}") print(f" β€’ Attached hooks: {len(hooks_info['status']['attached_hooks'])}") if 'summary' in hooks_info: summary = hooks_info['summary'] print(f" β€’ Total executions: {summary['total_executions']}") print(f" β€’ Successful: {summary['successful']}") print(f" β€’ Success rate: {summary['success_rate']:.1f}%") else: print(f"⚠️ Crawl completed but no results") else: print(f"❌ Request failed with status {response.status_code}") print(f" Error: {response.text[:200]}") except requests.exceptions.Timeout: print("⏰ Request timed out after 60 seconds") except Exception as e: print(f"❌ Error: {str(e)}") print("\n" + "─" * 70) print("βœ“ String-based hooks example complete\n") # ============================================================================ # APPROACH 2: Function-Based Hooks with hooks_to_string() Utility # ============================================================================ def example_2_hooks_to_string_utility(): """ Demonstrate the hooks_to_string() utility for converting functions Use this when you want to write hooks as functions but use REST API """ print_section( "APPROACH 2: hooks_to_string() Utility", "Convert Python functions to strings for REST API" ) print("πŸ“¦ Creating hook functions...") print(" β€’ performance_optimization_hook") print(" β€’ authentication_headers_hook") print(" β€’ lazy_loading_handler_hook") # Convert function objects to strings using the utility print("\nπŸ”„ Converting functions to strings with hooks_to_string()...") hooks_dict = { "on_page_context_created": performance_optimization_hook, "before_goto": authentication_headers_hook, "before_retrieve_html": lazy_loading_handler_hook, } hooks_as_strings = hooks_to_string(hooks_dict) print(f"βœ… Successfully converted {len(hooks_as_strings)} functions to strings") # Show a preview print("\nπŸ“ Sample converted hook (first 200 characters):") print("─" * 70) sample_hook = list(hooks_as_strings.values())[0] print(sample_hook[:200] + "...") print("─" * 70) # Use the converted hooks with REST API print("\nπŸ“‘ Using converted hooks with REST API...") payload = { "urls": [TEST_URLS[2]], "hooks": { "code": hooks_as_strings, "timeout": 30 } } try: start_time = time.time() response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) execution_time = time.time() - start_time if response.status_code == 200: result = response.json() print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") if result.get('results') and result['results'][0].get('success'): crawl_result = result['results'][0] print(f" β€’ HTML length: {len(crawl_result.get('html', '')):,} characters") print(f" β€’ Hooks executed successfully!") else: print(f"❌ Request failed: {response.status_code}") except Exception as e: print(f"❌ Error: {str(e)}") print("\nπŸ’‘ Benefits of hooks_to_string():") print(" βœ“ Write hooks as regular Python functions") print(" βœ“ Full IDE support (autocomplete, syntax highlighting)") print(" βœ“ Type checking and linting") print(" βœ“ Easy to test and debug") print(" βœ“ Reusable across projects") print(" βœ“ Works with any REST API client") print("\n" + "─" * 70) print("βœ“ hooks_to_string() utility example complete\n") # ============================================================================ # APPROACH 3: Docker Client with Automatic Conversion (RECOMMENDED) # ============================================================================ async def example_3_docker_client_auto_conversion(): """ Demonstrate Docker Client with automatic hook conversion (RECOMMENDED) Use this for the best developer experience with Python """ print_section( "APPROACH 3: Docker Client with Auto-Conversion (RECOMMENDED)", "Pass function objects directly - conversion happens automatically!" ) print("🐳 Initializing Crawl4AI Docker Client...") client = Crawl4aiDockerClient(base_url=DOCKER_URL) print("βœ… Client ready!\n") # Use our reusable hook library - just pass the function objects! print("πŸ“š Using reusable hook library:") print(" β€’ performance_optimization_hook") print(" β€’ authentication_headers_hook") print(" β€’ lazy_loading_handler_hook") print(" β€’ page_analytics_hook") print("\n🎯 Target URL: " + TEST_URLS[0]) print("πŸš€ Starting crawl with automatic hook conversion...\n") try: start_time = time.time() # Pass function objects directly - NO manual conversion needed! ✨ results = await client.crawl( urls=[TEST_URLS[0]], hooks={ "on_page_context_created": performance_optimization_hook, "before_goto": authentication_headers_hook, "before_retrieve_html": lazy_loading_handler_hook, "before_return_html": page_analytics_hook, }, hooks_timeout=30 ) execution_time = time.time() - start_time print(f"\nβœ… Crawl completed! (took {execution_time:.2f}s)\n") # Display results if results and results.success: result = results print(f"πŸ“Š Results:") print(f" β€’ URL: {result.url}") print(f" β€’ Success: {result.success}") print(f" β€’ HTML length: {len(result.html):,} characters") print(f" β€’ Markdown length: {len(result.markdown):,} characters") # Show metadata if result.metadata: print(f"\nπŸ“‹ Metadata:") print(f" β€’ Title: {result.metadata.get('title', 'N/A')[:50]}...") # Show links if result.links: internal_count = len(result.links.get('internal', [])) external_count = len(result.links.get('external', [])) print(f"\nπŸ”— Links Found:") print(f" β€’ Internal: {internal_count}") print(f" β€’ External: {external_count}") else: print(f"⚠️ Crawl completed but no successful results") if results: print(f" Error: {results.error_message}") except Exception as e: print(f"❌ Error: {str(e)}") import traceback traceback.print_exc() print("\n🌟 Why Docker Client is RECOMMENDED:") print(" βœ“ Automatic function-to-string conversion") print(" βœ“ No manual hooks_to_string() calls needed") print(" βœ“ Cleaner, more Pythonic code") print(" βœ“ Full type hints and IDE support") print(" βœ“ Built-in error handling") print(" βœ“ Async/await support") print("\n" + "─" * 70) print("βœ“ Docker Client auto-conversion example complete\n") # ============================================================================ # APPROACH 4: Authentication Example # ============================================================================ def example_4_authentication_flow(): """ Demonstrate authentication flow with multiple hooks """ print_section( "EXAMPLE 4: Authentication Flow", "Using hooks for authentication with cookies and headers" ) hooks_code = { "on_page_context_created": """ async def hook(page, context, **kwargs): print("[HOOK] Setting up authentication context") # Add authentication cookies await context.add_cookies([ { "name": "auth_token", "value": "fake_jwt_token_here", "domain": ".httpbin.org", "path": "/", "httpOnly": True, "secure": True } ]) return page """, "before_goto": """ async def hook(page, context, url, **kwargs): print(f"[HOOK] Adding auth headers for {url}") # Add Authorization header import base64 credentials = base64.b64encode(b"user:passwd").decode('ascii') await page.set_extra_http_headers({ 'Authorization': f'Basic {credentials}', 'X-API-Key': 'test-api-key-123' }) return page """ } payload = { "urls": ["https://httpbin.org/basic-auth/user/passwd"], "hooks": { "code": hooks_code, "timeout": 15 } } print("\nTesting authentication with httpbin endpoints...") response = requests.post(f"{DOCKER_URL}/crawl", json=payload) if response.status_code == 200: data = response.json() print("βœ… Authentication test completed") if 'results' in data: for i, result in enumerate(data['results']): print(f"\n URL {i+1}: {result['url']}") if result.get('success'): # Check for authentication success indicators html_content = result.get('html', '') if '"authenticated"' in html_content and 'true' in html_content: print(" βœ… Authentication successful! Basic auth worked.") else: print(" ⚠️ Page loaded but auth status unclear") else: print(f" ❌ Failed: {result.get('error_message', 'Unknown error')}") else: print(f"❌ Error: {response.status_code}") print("\n" + "─" * 70) print("βœ“ Authentication example complete\n") # ============================================================================ # MAIN EXECUTION # ============================================================================ async def main(): """ Run all example demonstrations """ print("\n" + "=" * 70) print(" πŸš€ Crawl4AI - Docker Hooks System Examples") print("=" * 70) # Check Docker service print("\nπŸ” Checking Docker service status...") if not check_docker_service(): print("❌ Docker service is not running!") print("\nπŸ“‹ To start the Docker service:") print(" docker run -p 11235:11235 unclecode/crawl4ai:latest") print("\nPlease start the service and run this example again.") return print("βœ… Docker service is running!\n") # Run all examples examples = [ ("String-Based Hooks (REST API)", example_1_string_based_hooks, False), ("hooks_to_string() Utility", example_2_hooks_to_string_utility, False), ("Docker Client Auto-Conversion (Recommended)", example_3_docker_client_auto_conversion, True), ("Authentication Flow", example_4_authentication_flow, False), ] for i, (name, example_func, is_async) in enumerate(examples, 1): print(f"\n{'πŸ”·' * 35}") print(f"Example {i}/{len(examples)}: {name}") print(f"{'πŸ”·' * 35}\n") try: if is_async: await example_func() else: example_func() print(f"βœ… Example {i} completed successfully!") # Pause between examples (except the last one) if i < len(examples): print("\n⏸️ Press Enter to continue to next example...") input() except KeyboardInterrupt: print(f"\n⏹️ Examples interrupted by user") break except Exception as e: print(f"\n❌ Example {i} failed: {str(e)}") import traceback traceback.print_exc() print("\nContinuing to next example...\n") continue # Final summary print("\n" + "=" * 70) print(" πŸŽ‰ All Examples Complete!") print("=" * 70) print("\nπŸ“Š Summary - Three Approaches to Docker Hooks:") print("\n✨ 1. String-Based Hooks:") print(" β€’ Write hooks as strings directly in JSON") print(" β€’ Best for: REST API, non-Python clients, simple use cases") print(" β€’ Cons: No IDE support, harder to debug") print("\n✨ 2. hooks_to_string() Utility:") print(" β€’ Write hooks as Python functions, convert to strings") print(" β€’ Best for: Python with REST API, reusable hook libraries") print(" β€’ Pros: IDE support, type checking, easy debugging") print("\n✨ 3. Docker Client (RECOMMENDED):") print(" β€’ Pass function objects directly, automatic conversion") print(" β€’ Best for: Python applications, best developer experience") print(" β€’ Pros: All benefits of #2 + cleaner code, no manual conversion") print("\nπŸ’‘ Recommendation:") print(" Use Docker Client (#3) for Python applications") print(" Use hooks_to_string() (#2) when you need REST API flexibility") print(" Use string-based (#1) for non-Python clients or simple scripts") print("\n🎯 8 Hook Points Available:") print(" β€’ on_browser_created, on_page_context_created") print(" β€’ on_user_agent_updated, before_goto, after_goto") print(" β€’ on_execution_started, before_retrieve_html, before_return_html") print("\nπŸ“š Resources:") print(" β€’ Docs: https://docs.crawl4ai.com/core/docker-deployment") print(" β€’ GitHub: https://github.com/unclecode/crawl4ai") print(" β€’ Discord: https://discord.gg/jP8KfhDhyN") print("\n" + "=" * 70) print(" Happy Crawling! πŸ•·οΈ") print("=" * 70 + "\n") if __name__ == "__main__": print("\n🎬 Starting Crawl4AI Docker Hooks Examples...") print("Press Ctrl+C anytime to exit\n") try: asyncio.run(main()) except KeyboardInterrupt: print("\n\nπŸ‘‹ Examples stopped by user. Thanks for exploring Crawl4AI!") except Exception as e: print(f"\n\n❌ Error: {str(e)}") import traceback traceback.print_exc()