From b74524fdfbf532ddd4b3740cdbc5fa6278037767 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 22 Oct 2025 16:29:19 +0200 Subject: [PATCH] docs: update docker_hooks_examples.py with comprehensive examples and improved structure --- docs/examples/docker_hooks_examples.py | 922 ++++++++++++++----------- 1 file changed, 518 insertions(+), 404 deletions(-) diff --git a/docs/examples/docker_hooks_examples.py b/docs/examples/docker_hooks_examples.py index a9c94d03..b64caf02 100644 --- a/docs/examples/docker_hooks_examples.py +++ b/docs/examples/docker_hooks_examples.py @@ -1,235 +1,451 @@ #!/usr/bin/env python3 """ -Comprehensive test demonstrating all hook types from hooks_example.py -adapted for the Docker API with real URLs +πŸš€ Crawl4AI Docker Hooks System - Complete Examples +==================================================== + +This file demonstrates the Docker Hooks System with three different approaches: + +1. String-based hooks for REST API +2. hooks_to_string() utility to convert functions +3. Docker Client with automatic conversion (most convenient) + +Requirements: +- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest +- crawl4ai installed: pip install crawl4ai """ +import asyncio import requests import json import time from typing import Dict, Any -# API_BASE_URL = "http://localhost:11234" -API_BASE_URL = "http://localhost:11235" +# Import Crawl4AI components +from crawl4ai import hooks_to_string +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Configuration +DOCKER_URL = "http://localhost:11235" +TEST_URLS = [ + "https://www.kidocode.com", + "https://quotes.toscrape.com", + "https://httpbin.org/html", +] -def test_all_hooks_demo(): - """Demonstrate all 8 hook types with practical examples""" - print("=" * 70) - print("Testing: All Hooks Comprehensive Demo") - print("=" * 70) - - hooks_code = { - "on_browser_created": """ -async def hook(browser, **kwargs): - # Hook called after browser is created - print("[HOOK] on_browser_created - Browser is ready!") - # Browser-level configurations would go here - return browser -""", - - "on_page_context_created": """ -async def hook(page, context, **kwargs): - # Hook called after a new page and context are created - print("[HOOK] on_page_context_created - New page created!") - - # Set viewport size for consistent rendering - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Add cookies for the session (using httpbin.org domain) - await context.add_cookies([ - { - "name": "test_session", - "value": "abc123xyz", - "domain": ".httpbin.org", - "path": "/", - "httpOnly": True, - "secure": True - } - ]) - - # Block ads and tracking scripts to speed up crawling - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda route: route.abort()) +def print_section(title: str, description: str = ""): + """Print a formatted section header""" + print("\n" + "=" * 70) + print(f" {title}") + if description: + print(f" {description}") + print("=" * 70 + "\n") + + +def check_docker_service() -> bool: + """Check if Docker service is running""" + try: + response = requests.get(f"{DOCKER_URL}/health", timeout=3) + return response.status_code == 200 + except: + return False + + +# ============================================================================ +# REUSABLE HOOK LIBRARY +# ============================================================================ + +async def performance_optimization_hook(page, context, **kwargs): + """ + Performance Hook: Block unnecessary resources to speed up crawling + """ + print(" [Hook] πŸš€ Optimizing performance - blocking images and ads...") + + # Block images + await context.route( + "**/*.{png,jpg,jpeg,gif,webp,svg,ico}", + lambda route: route.abort() + ) + + # Block ads and analytics await context.route("**/analytics/*", lambda route: route.abort()) await context.route("**/ads/*", lambda route: route.abort()) - - print("[HOOK] Viewport set, cookies added, and ads blocked") + await context.route("**/google-analytics.com/*", lambda route: route.abort()) + + print(" [Hook] βœ“ Performance optimization applied") return page -""", - - "on_user_agent_updated": """ -async def hook(page, context, user_agent, **kwargs): - # Hook called when user agent is updated - print(f"[HOOK] on_user_agent_updated - User agent: {user_agent[:50]}...") + + +async def viewport_setup_hook(page, context, **kwargs): + """ + Viewport Hook: Set consistent viewport size for rendering + """ + print(" [Hook] πŸ–₯️ Setting viewport to 1920x1080...") + await page.set_viewport_size({"width": 1920, "height": 1080}) + print(" [Hook] βœ“ Viewport configured") return page -""", - - "before_goto": """ -async def hook(page, context, url, **kwargs): - # Hook called before navigating to each URL - print(f"[HOOK] before_goto - About to visit: {url}") - - # Add custom headers for the request + + +async def authentication_headers_hook(page, context, url, **kwargs): + """ + Headers Hook: Add custom authentication and tracking headers + """ + print(f" [Hook] πŸ” Adding custom headers for {url[:50]}...") + await page.set_extra_http_headers({ - "X-Custom-Header": "crawl4ai-test", - "Accept-Language": "en-US,en;q=0.9", - "DNT": "1" + 'X-Crawl4AI': 'docker-hooks', + 'X-Custom-Hook': 'function-based', + 'Accept-Language': 'en-US,en;q=0.9', }) - + + print(" [Hook] βœ“ Custom headers added") return page -""", - - "after_goto": """ -async def hook(page, context, url, response, **kwargs): - # Hook called after navigating to each URL - print(f"[HOOK] after_goto - Successfully loaded: {url}") - - # Wait a moment for dynamic content to load + + +async def lazy_loading_handler_hook(page, context, **kwargs): + """ + Content Hook: Handle lazy-loaded content by scrolling + """ + print(" [Hook] πŸ“œ Scrolling to load lazy content...") + + # Scroll to bottom + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) - - # Check if specific elements exist (with error handling) - try: - # For httpbin.org, wait for body element - await page.wait_for_selector("body", timeout=2000) - print("[HOOK] Body element found and loaded") - except: - print("[HOOK] Timeout waiting for body, continuing anyway") - - return page -""", - - "on_execution_started": """ -async def hook(page, context, **kwargs): - # Hook called after custom JavaScript execution - print("[HOOK] on_execution_started - Custom JS executed!") - - # You could inject additional JavaScript here if needed - await page.evaluate("console.log('[INJECTED] Hook JS running');") - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - # Hook called before retrieving the HTML content - print("[HOOK] before_retrieve_html - Preparing to get HTML") - - # Scroll to bottom to trigger lazy loading - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + + # Scroll to middle + await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") await page.wait_for_timeout(500) - + # Scroll back to top - await page.evaluate("window.scrollTo(0, 0);") + await page.evaluate("window.scrollTo(0, 0)") await page.wait_for_timeout(500) - - # One more scroll to middle for good measure - await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2);") - - print("[HOOK] Scrolling completed for lazy-loaded content") + + print(" [Hook] βœ“ Lazy content loaded") return page -""", - - "before_return_html": """ -async def hook(page, context, html, **kwargs): - # Hook called before returning the HTML content - print(f"[HOOK] before_return_html - HTML length: {len(html)} characters") - - # Log some page metrics - metrics = await page.evaluate('''() => { - return { + + +async def page_analytics_hook(page, context, **kwargs): + """ + Analytics Hook: Log page metrics before extraction + """ + print(" [Hook] πŸ“Š Collecting page analytics...") + + metrics = await page.evaluate(''' + () => ({ + title: document.title, images: document.images.length, links: document.links.length, - scripts: document.scripts.length - } - }''') - - print(f"[HOOK] Page metrics - Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}") - + scripts: document.scripts.length, + headings: document.querySelectorAll('h1, h2, h3').length, + paragraphs: document.querySelectorAll('p').length + }) + ''') + + print(f" [Hook] πŸ“ˆ Page: {metrics['title'][:50]}...") + print(f" Links: {metrics['links']}, Images: {metrics['images']}, " + f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}") + + return page + + +# ============================================================================ +# APPROACH 1: String-Based Hooks (REST API) +# ============================================================================ + +def example_1_string_based_hooks(): + """ + Demonstrate string-based hooks with REST API + Use this when working with REST API directly or non-Python clients + """ + print_section( + "APPROACH 1: String-Based Hooks (REST API)", + "Define hooks as strings for REST API requests" + ) + + # Define hooks as strings + hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Setting up page context...") + # Block images for performance + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f" [String Hook] Navigating to {url[:50]}...") + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'string-based-hooks', + }) + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Scrolling page...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) return page """ } - - # Create request payload + + # Prepare request payload payload = { - "urls": ["https://httpbin.org/html"], + "urls": [TEST_URLS[2]], # httpbin.org "hooks": { - "code": hooks_code, + "code": hooks_config, "timeout": 30 }, "crawler_config": { - "js_code": "window.scrollTo(0, document.body.scrollHeight);", - "wait_for": "body", "cache_mode": "bypass" } } - - print("\nSending request with all 8 hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("\nβœ… Request successful!") - - # Check hooks execution - if 'hooks' in data: - hooks_info = data['hooks'] - print("\nπŸ“Š Hooks Execution Summary:") - print(f" Status: {hooks_info['status']['status']}") - print(f" Attached hooks: {len(hooks_info['status']['attached_hooks'])}") - - for hook_name in hooks_info['status']['attached_hooks']: - print(f" βœ“ {hook_name}") - - if 'summary' in hooks_info: - summary = hooks_info['summary'] - print(f"\nπŸ“ˆ Execution Statistics:") - print(f" Total executions: {summary['total_executions']}") - print(f" Successful: {summary['successful']}") - print(f" Failed: {summary['failed']}") - print(f" Timed out: {summary['timed_out']}") - print(f" Success rate: {summary['success_rate']:.1f}%") - - if hooks_info.get('execution_log'): - print(f"\nπŸ“ Execution Log:") - for log_entry in hooks_info['execution_log']: - status_icon = "βœ…" if log_entry['status'] == 'success' else "❌" - exec_time = log_entry.get('execution_time', 0) - print(f" {status_icon} {log_entry['hook_point']}: {exec_time:.3f}s") - - # Check crawl results - if 'results' in data and len(data['results']) > 0: - print(f"\nπŸ“„ Crawl Results:") - for result in data['results']: - print(f" URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - if result.get('html'): - print(f" HTML length: {len(result['html'])} characters") - - else: - print(f"❌ Error: {response.status_code}") - try: - error_data = response.json() - print(f"Error details: {json.dumps(error_data, indent=2)}") - except: - print(f"Error text: {response.text[:500]}") + + print(f"🎯 Target URL: {TEST_URLS[2]}") + print(f"πŸ”§ Configured {len(hooks_config)} string-based hooks") + print(f"πŸ“‘ Sending request to Docker API...\n") + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + + print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") + + # Display results + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + html_length = len(crawl_result.get('html', '')) + markdown_length = len(crawl_result.get('markdown', '')) + + print(f"\nπŸ“Š Results:") + print(f" β€’ HTML length: {html_length:,} characters") + print(f" β€’ Markdown length: {markdown_length:,} characters") + print(f" β€’ URL: {crawl_result.get('url')}") + + # Check hooks execution + if 'hooks' in result: + hooks_info = result['hooks'] + print(f"\n🎣 Hooks Execution:") + print(f" β€’ Status: {hooks_info['status']['status']}") + print(f" β€’ Attached hooks: {len(hooks_info['status']['attached_hooks'])}") + + if 'summary' in hooks_info: + summary = hooks_info['summary'] + print(f" β€’ Total executions: {summary['total_executions']}") + print(f" β€’ Successful: {summary['successful']}") + print(f" β€’ Success rate: {summary['success_rate']:.1f}%") + else: + print(f"⚠️ Crawl completed but no results") + + else: + print(f"❌ Request failed with status {response.status_code}") + print(f" Error: {response.text[:200]}") + + except requests.exceptions.Timeout: + print("⏰ Request timed out after 60 seconds") + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\n" + "─" * 70) + print("βœ“ String-based hooks example complete\n") -def test_authentication_flow(): - """Test a complete authentication flow with multiple hooks""" - print("\n" + "=" * 70) - print("Testing: Authentication Flow with Multiple Hooks") - print("=" * 70) - +# ============================================================================ +# APPROACH 2: Function-Based Hooks with hooks_to_string() Utility +# ============================================================================ + +def example_2_hooks_to_string_utility(): + """ + Demonstrate the hooks_to_string() utility for converting functions + Use this when you want to write hooks as functions but use REST API + """ + print_section( + "APPROACH 2: hooks_to_string() Utility", + "Convert Python functions to strings for REST API" + ) + + print("πŸ“¦ Creating hook functions...") + print(" β€’ performance_optimization_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + + # Convert function objects to strings using the utility + print("\nπŸ”„ Converting functions to strings with hooks_to_string()...") + + hooks_dict = { + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + } + + hooks_as_strings = hooks_to_string(hooks_dict) + + print(f"βœ… Successfully converted {len(hooks_as_strings)} functions to strings") + + # Show a preview + print("\nπŸ“ Sample converted hook (first 200 characters):") + print("─" * 70) + sample_hook = list(hooks_as_strings.values())[0] + print(sample_hook[:200] + "...") + print("─" * 70) + + # Use the converted hooks with REST API + print("\nπŸ“‘ Using converted hooks with REST API...") + + payload = { + "urls": [TEST_URLS[2]], + "hooks": { + "code": hooks_as_strings, + "timeout": 30 + } + } + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") + + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + print(f" β€’ HTML length: {len(crawl_result.get('html', '')):,} characters") + print(f" β€’ Hooks executed successfully!") + else: + print(f"❌ Request failed: {response.status_code}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\nπŸ’‘ Benefits of hooks_to_string():") + print(" βœ“ Write hooks as regular Python functions") + print(" βœ“ Full IDE support (autocomplete, syntax highlighting)") + print(" βœ“ Type checking and linting") + print(" βœ“ Easy to test and debug") + print(" βœ“ Reusable across projects") + print(" βœ“ Works with any REST API client") + + print("\n" + "─" * 70) + print("βœ“ hooks_to_string() utility example complete\n") + + +# ============================================================================ +# APPROACH 3: Docker Client with Automatic Conversion (RECOMMENDED) +# ============================================================================ + +async def example_3_docker_client_auto_conversion(): + """ + Demonstrate Docker Client with automatic hook conversion (RECOMMENDED) + Use this for the best developer experience with Python + """ + print_section( + "APPROACH 3: Docker Client with Auto-Conversion (RECOMMENDED)", + "Pass function objects directly - conversion happens automatically!" + ) + + print("🐳 Initializing Crawl4AI Docker Client...") + client = Crawl4aiDockerClient(base_url=DOCKER_URL) + + print("βœ… Client ready!\n") + + # Use our reusable hook library - just pass the function objects! + print("πŸ“š Using reusable hook library:") + print(" β€’ performance_optimization_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + print(" β€’ page_analytics_hook") + + print("\n🎯 Target URL: " + TEST_URLS[0]) + print("πŸš€ Starting crawl with automatic hook conversion...\n") + + try: + start_time = time.time() + + # Pass function objects directly - NO manual conversion needed! ✨ + results = await client.crawl( + urls=[TEST_URLS[0]], + hooks={ + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + "before_return_html": page_analytics_hook, + }, + hooks_timeout=30 + ) + + execution_time = time.time() - start_time + + print(f"\nβœ… Crawl completed! (took {execution_time:.2f}s)\n") + + # Display results + if results and results.success: + result = results + print(f"πŸ“Š Results:") + print(f" β€’ URL: {result.url}") + print(f" β€’ Success: {result.success}") + print(f" β€’ HTML length: {len(result.html):,} characters") + print(f" β€’ Markdown length: {len(result.markdown):,} characters") + + # Show metadata + if result.metadata: + print(f"\nπŸ“‹ Metadata:") + print(f" β€’ Title: {result.metadata.get('title', 'N/A')[:50]}...") + + # Show links + if result.links: + internal_count = len(result.links.get('internal', [])) + external_count = len(result.links.get('external', [])) + print(f"\nπŸ”— Links Found:") + print(f" β€’ Internal: {internal_count}") + print(f" β€’ External: {external_count}") + else: + print(f"⚠️ Crawl completed but no successful results") + if results: + print(f" Error: {results.error_message}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + import traceback + traceback.print_exc() + + print("\n🌟 Why Docker Client is RECOMMENDED:") + print(" βœ“ Automatic function-to-string conversion") + print(" βœ“ No manual hooks_to_string() calls needed") + print(" βœ“ Cleaner, more Pythonic code") + print(" βœ“ Full type hints and IDE support") + print(" βœ“ Built-in error handling") + print(" βœ“ Async/await support") + + print("\n" + "─" * 70) + print("βœ“ Docker Client auto-conversion example complete\n") + + +# ============================================================================ +# APPROACH 4: Authentication Example +# ============================================================================ + +def example_4_authentication_flow(): + """ + Demonstrate authentication flow with multiple hooks + """ + print_section( + "EXAMPLE 4: Authentication Flow", + "Using hooks for authentication with cookies and headers" + ) + hooks_code = { "on_page_context_created": """ async def hook(page, context, **kwargs): print("[HOOK] Setting up authentication context") - + # Add authentication cookies await context.add_cookies([ { @@ -241,50 +457,42 @@ async def hook(page, context, **kwargs): "secure": True } ]) - - # Set localStorage items (for SPA authentication) - await page.evaluate(''' - localStorage.setItem('user_id', '12345'); - localStorage.setItem('auth_time', new Date().toISOString()); - ''') - + return page """, - + "before_goto": """ async def hook(page, context, url, **kwargs): print(f"[HOOK] Adding auth headers for {url}") - + # Add Authorization header import base64 credentials = base64.b64encode(b"user:passwd").decode('ascii') - + await page.set_extra_http_headers({ 'Authorization': f'Basic {credentials}', 'X-API-Key': 'test-api-key-123' }) - + return page """ } - + payload = { - "urls": [ - "https://httpbin.org/basic-auth/user/passwd" - ], + "urls": ["https://httpbin.org/basic-auth/user/passwd"], "hooks": { "code": hooks_code, "timeout": 15 } } - + print("\nTesting authentication with httpbin endpoints...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - + response = requests.post(f"{DOCKER_URL}/crawl", json=payload) + if response.status_code == 200: data = response.json() print("βœ… Authentication test completed") - + if 'results' in data: for i, result in enumerate(data['results']): print(f"\n URL {i+1}: {result['url']}") @@ -300,214 +508,120 @@ async def hook(page, context, url, **kwargs): else: print(f"❌ Error: {response.status_code}") + print("\n" + "─" * 70) + print("βœ“ Authentication example complete\n") -def test_performance_optimization_hooks(): - """Test hooks for performance optimization""" + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +async def main(): + """ + Run all example demonstrations + """ print("\n" + "=" * 70) - print("Testing: Performance Optimization Hooks") + print(" πŸš€ Crawl4AI - Docker Hooks System Examples") print("=" * 70) - - hooks_code = { - "on_page_context_created": """ -async def hook(page, context, **kwargs): - print("[HOOK] Optimizing page for performance") - - # Block resource-heavy content - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg,ico}", lambda route: route.abort()) - await context.route("**/*.{woff,woff2,ttf,otf}", lambda route: route.abort()) - await context.route("**/*.{mp4,webm,ogg,mp3,wav}", lambda route: route.abort()) - await context.route("**/googletagmanager.com/*", lambda route: route.abort()) - await context.route("**/google-analytics.com/*", lambda route: route.abort()) - await context.route("**/doubleclick.net/*", lambda route: route.abort()) - await context.route("**/facebook.com/*", lambda route: route.abort()) - - # Disable animations and transitions - await page.add_style_tag(content=''' - *, *::before, *::after { - animation-duration: 0s !important; - animation-delay: 0s !important; - transition-duration: 0s !important; - transition-delay: 0s !important; - } - ''') - - print("[HOOK] Performance optimizations applied") - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Removing unnecessary elements before extraction") - - # Remove ads, popups, and other unnecessary elements - await page.evaluate('''() => { - // Remove common ad containers - const adSelectors = [ - '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]', - '.popup', '.modal', '.overlay', '.cookie-banner', '.newsletter-signup' - ]; - - adSelectors.forEach(selector => { - document.querySelectorAll(selector).forEach(el => el.remove()); - }); - - // Remove script tags to clean up HTML - document.querySelectorAll('script').forEach(el => el.remove()); - - // Remove style tags we don't need - document.querySelectorAll('style').forEach(el => el.remove()); - }''') - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html"], - "hooks": { - "code": hooks_code, - "timeout": 10 - } - } - - print("\nTesting performance optimization hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("βœ… Performance optimization test completed") - - if 'results' in data and len(data['results']) > 0: - result = data['results'][0] - if result.get('html'): - print(f" HTML size: {len(result['html'])} characters") - print(" Resources blocked, ads removed, animations disabled") - else: - print(f"❌ Error: {response.status_code}") + # Check Docker service + print("\nπŸ” Checking Docker service status...") + if not check_docker_service(): + print("❌ Docker service is not running!") + print("\nπŸ“‹ To start the Docker service:") + print(" docker run -p 11235:11235 unclecode/crawl4ai:latest") + print("\nPlease start the service and run this example again.") + return -def test_content_extraction_hooks(): - """Test hooks for intelligent content extraction""" - print("\n" + "=" * 70) - print("Testing: Content Extraction Hooks") - print("=" * 70) - - hooks_code = { - "after_goto": """ -async def hook(page, context, url, response, **kwargs): - print(f"[HOOK] Waiting for dynamic content on {url}") - - # Wait for any lazy-loaded content - await page.wait_for_timeout(2000) - - # Trigger any "Load More" buttons - try: - load_more = await page.query_selector('[class*="load-more"], [class*="show-more"], button:has-text("Load More")') - if load_more: - await load_more.click() - await page.wait_for_timeout(1000) - print("[HOOK] Clicked 'Load More' button") - except: - pass - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Extracting structured data") - - # Extract metadata - metadata = await page.evaluate('''() => { - const getMeta = (name) => { - const element = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); - return element ? element.getAttribute('content') : null; - }; - - return { - title: document.title, - description: getMeta('description') || getMeta('og:description'), - author: getMeta('author'), - keywords: getMeta('keywords'), - ogTitle: getMeta('og:title'), - ogImage: getMeta('og:image'), - canonical: document.querySelector('link[rel="canonical"]')?.href, - jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]')) - .map(el => el.textContent).filter(Boolean) - }; - }''') - - print(f"[HOOK] Extracted metadata: {json.dumps(metadata, indent=2)}") - - # Infinite scroll handling - for i in range(3): - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") - await page.wait_for_timeout(1000) - print(f"[HOOK] Scroll iteration {i+1}/3") - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html", "https://httpbin.org/json"], - "hooks": { - "code": hooks_code, - "timeout": 20 - } - } - - print("\nTesting content extraction hooks...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - if response.status_code == 200: - data = response.json() - print("βœ… Content extraction test completed") - - if 'hooks' in data and 'summary' in data['hooks']: - summary = data['hooks']['summary'] - print(f" Hooks executed: {summary['successful']}/{summary['total_executions']}") - - if 'results' in data: - for result in data['results']: - print(f"\n URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - else: - print(f"❌ Error: {response.status_code}") + print("βœ… Docker service is running!\n") - -def main(): - """Run comprehensive hook tests""" - print("πŸ”§ Crawl4AI Docker API - Comprehensive Hooks Testing") - print("Based on docs/examples/hooks_example.py") - print("=" * 70) - - tests = [ - ("All Hooks Demo", test_all_hooks_demo), - ("Authentication Flow", test_authentication_flow), - ("Performance Optimization", test_performance_optimization_hooks), - ("Content Extraction", test_content_extraction_hooks), + # Run all examples + examples = [ + ("String-Based Hooks (REST API)", example_1_string_based_hooks, False), + ("hooks_to_string() Utility", example_2_hooks_to_string_utility, False), + ("Docker Client Auto-Conversion (Recommended)", example_3_docker_client_auto_conversion, True), + ("Authentication Flow", example_4_authentication_flow, False), ] - - for i, (name, test_func) in enumerate(tests, 1): - print(f"\nπŸ“Œ Test {i}/{len(tests)}: {name}") + + for i, (name, example_func, is_async) in enumerate(examples, 1): + print(f"\n{'πŸ”·' * 35}") + print(f"Example {i}/{len(examples)}: {name}") + print(f"{'πŸ”·' * 35}\n") + try: - test_func() - print(f"βœ… {name} completed") + if is_async: + await example_func() + else: + example_func() + + print(f"βœ… Example {i} completed successfully!") + + # Pause between examples (except the last one) + if i < len(examples): + print("\n⏸️ Press Enter to continue to next example...") + input() + + except KeyboardInterrupt: + print(f"\n⏹️ Examples interrupted by user") + break except Exception as e: - print(f"❌ {name} failed: {e}") + print(f"\n❌ Example {i} failed: {str(e)}") import traceback traceback.print_exc() - + print("\nContinuing to next example...\n") + continue + + # Final summary print("\n" + "=" * 70) - print("πŸŽ‰ All comprehensive hook tests completed!") + print(" πŸŽ‰ All Examples Complete!") print("=" * 70) + print("\nπŸ“Š Summary - Three Approaches to Docker Hooks:") + + print("\n✨ 1. String-Based Hooks:") + print(" β€’ Write hooks as strings directly in JSON") + print(" β€’ Best for: REST API, non-Python clients, simple use cases") + print(" β€’ Cons: No IDE support, harder to debug") + + print("\n✨ 2. hooks_to_string() Utility:") + print(" β€’ Write hooks as Python functions, convert to strings") + print(" β€’ Best for: Python with REST API, reusable hook libraries") + print(" β€’ Pros: IDE support, type checking, easy debugging") + + print("\n✨ 3. Docker Client (RECOMMENDED):") + print(" β€’ Pass function objects directly, automatic conversion") + print(" β€’ Best for: Python applications, best developer experience") + print(" β€’ Pros: All benefits of #2 + cleaner code, no manual conversion") + + print("\nπŸ’‘ Recommendation:") + print(" Use Docker Client (#3) for Python applications") + print(" Use hooks_to_string() (#2) when you need REST API flexibility") + print(" Use string-based (#1) for non-Python clients or simple scripts") + + print("\n🎯 8 Hook Points Available:") + print(" β€’ on_browser_created, on_page_context_created") + print(" β€’ on_user_agent_updated, before_goto, after_goto") + print(" β€’ on_execution_started, before_retrieve_html, before_return_html") + + print("\nπŸ“š Resources:") + print(" β€’ Docs: https://docs.crawl4ai.com/core/docker-deployment") + print(" β€’ GitHub: https://github.com/unclecode/crawl4ai") + print(" β€’ Discord: https://discord.gg/jP8KfhDhyN") + + print("\n" + "=" * 70) + print(" Happy Crawling! πŸ•·οΈ") + print("=" * 70 + "\n") + if __name__ == "__main__": - main() \ No newline at end of file + print("\n🎬 Starting Crawl4AI Docker Hooks Examples...") + print("Press Ctrl+C anytime to exit\n") + + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\nπŸ‘‹ Examples stopped by user. Thanks for exploring Crawl4AI!") + except Exception as e: + print(f"\n\n❌ Error: {str(e)}") + import traceback + traceback.print_exc()