diff --git a/deploy/docker/README.md b/deploy/docker/README.md index 2c920ef1..cee8af7f 100644 --- a/deploy/docker/README.md +++ b/deploy/docker/README.md @@ -785,6 +785,54 @@ curl http://localhost:11235/crawl/job/crawl_xyz The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`. +#### LLM Extraction Jobs with Webhooks + +The same webhook system works for LLM extraction jobs via `/llm/job`: + +```bash +# Submit LLM extraction job with webhook +curl -X POST http://localhost:11235/llm/job \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com/article", + "q": "Extract the article title, author, and main points", + "provider": "openai/gpt-4o-mini", + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/llm-complete", + "webhook_data_in_payload": true, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + } + }' + +# Response: {"task_id": "llm_1234567890"} +``` + +**Your webhook receives:** +```json +{ + "task_id": "llm_1234567890", + "task_type": "llm_extraction", + "status": "completed", + "timestamp": "2025-10-22T12:30:00.000000+00:00", + "urls": ["https://example.com/article"], + "data": { + "extracted_content": { + "title": "Understanding Web Scraping", + "author": "John Doe", + "main_points": ["Point 1", "Point 2", "Point 3"] + } + } +} +``` + +**Key Differences for LLM Jobs:** +- Task type is `"llm_extraction"` instead of `"crawl"` +- Extracted data is in `data.extracted_content` +- Single URL only (not an array) +- Supports schema-based extraction with `schema` parameter + > πŸ’‘ **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling. --- diff --git a/docs/examples/docker_hooks_examples.py b/docs/examples/docker_hooks_examples.py index a9c94d03..b64caf02 100644 --- a/docs/examples/docker_hooks_examples.py +++ b/docs/examples/docker_hooks_examples.py @@ -1,235 +1,451 @@ #!/usr/bin/env python3 """ -Comprehensive test demonstrating all hook types from hooks_example.py -adapted for the Docker API with real URLs +πŸš€ Crawl4AI Docker Hooks System - Complete Examples +==================================================== + +This file demonstrates the Docker Hooks System with three different approaches: + +1. String-based hooks for REST API +2. hooks_to_string() utility to convert functions +3. 
Docker Client with automatic conversion (most convenient) + +Requirements: +- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest +- crawl4ai installed: pip install crawl4ai """ +import asyncio import requests import json import time from typing import Dict, Any -# API_BASE_URL = "http://localhost:11234" -API_BASE_URL = "http://localhost:11235" +# Import Crawl4AI components +from crawl4ai import hooks_to_string +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Configuration +DOCKER_URL = "http://localhost:11235" +TEST_URLS = [ + "https://www.kidocode.com", + "https://quotes.toscrape.com", + "https://httpbin.org/html", +] -def test_all_hooks_demo(): - """Demonstrate all 8 hook types with practical examples""" - print("=" * 70) - print("Testing: All Hooks Comprehensive Demo") - print("=" * 70) - - hooks_code = { - "on_browser_created": """ -async def hook(browser, **kwargs): - # Hook called after browser is created - print("[HOOK] on_browser_created - Browser is ready!") - # Browser-level configurations would go here - return browser -""", - - "on_page_context_created": """ -async def hook(page, context, **kwargs): - # Hook called after a new page and context are created - print("[HOOK] on_page_context_created - New page created!") - - # Set viewport size for consistent rendering - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Add cookies for the session (using httpbin.org domain) - await context.add_cookies([ - { - "name": "test_session", - "value": "abc123xyz", - "domain": ".httpbin.org", - "path": "/", - "httpOnly": True, - "secure": True - } - ]) - - # Block ads and tracking scripts to speed up crawling - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda route: route.abort()) +def print_section(title: str, description: str = ""): + """Print a formatted section header""" + print("\n" + "=" * 70) + print(f" {title}") + if description: + print(f" {description}") + print("=" * 70 + "\n") + + +def check_docker_service() -> bool: + """Check if Docker service is running""" + try: + response = requests.get(f"{DOCKER_URL}/health", timeout=3) + return response.status_code == 200 + except: + return False + + +# ============================================================================ +# REUSABLE HOOK LIBRARY +# ============================================================================ + +async def performance_optimization_hook(page, context, **kwargs): + """ + Performance Hook: Block unnecessary resources to speed up crawling + """ + print(" [Hook] πŸš€ Optimizing performance - blocking images and ads...") + + # Block images + await context.route( + "**/*.{png,jpg,jpeg,gif,webp,svg,ico}", + lambda route: route.abort() + ) + + # Block ads and analytics await context.route("**/analytics/*", lambda route: route.abort()) await context.route("**/ads/*", lambda route: route.abort()) - - print("[HOOK] Viewport set, cookies added, and ads blocked") + await context.route("**/google-analytics.com/*", lambda route: route.abort()) + + print(" [Hook] βœ“ Performance optimization applied") return page -""", - - "on_user_agent_updated": """ -async def hook(page, context, user_agent, **kwargs): - # Hook called when user agent is updated - print(f"[HOOK] on_user_agent_updated - User agent: {user_agent[:50]}...") + + +async def viewport_setup_hook(page, context, **kwargs): + """ + Viewport Hook: Set consistent viewport size for rendering + """ + print(" [Hook] πŸ–₯️ Setting viewport to 1920x1080...") + await 
page.set_viewport_size({"width": 1920, "height": 1080}) + print(" [Hook] βœ“ Viewport configured") return page -""", - - "before_goto": """ -async def hook(page, context, url, **kwargs): - # Hook called before navigating to each URL - print(f"[HOOK] before_goto - About to visit: {url}") - - # Add custom headers for the request + + +async def authentication_headers_hook(page, context, url, **kwargs): + """ + Headers Hook: Add custom authentication and tracking headers + """ + print(f" [Hook] πŸ” Adding custom headers for {url[:50]}...") + await page.set_extra_http_headers({ - "X-Custom-Header": "crawl4ai-test", - "Accept-Language": "en-US,en;q=0.9", - "DNT": "1" + 'X-Crawl4AI': 'docker-hooks', + 'X-Custom-Hook': 'function-based', + 'Accept-Language': 'en-US,en;q=0.9', }) - + + print(" [Hook] βœ“ Custom headers added") return page -""", - - "after_goto": """ -async def hook(page, context, url, response, **kwargs): - # Hook called after navigating to each URL - print(f"[HOOK] after_goto - Successfully loaded: {url}") - - # Wait a moment for dynamic content to load + + +async def lazy_loading_handler_hook(page, context, **kwargs): + """ + Content Hook: Handle lazy-loaded content by scrolling + """ + print(" [Hook] πŸ“œ Scrolling to load lazy content...") + + # Scroll to bottom + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) - - # Check if specific elements exist (with error handling) - try: - # For httpbin.org, wait for body element - await page.wait_for_selector("body", timeout=2000) - print("[HOOK] Body element found and loaded") - except: - print("[HOOK] Timeout waiting for body, continuing anyway") - - return page -""", - - "on_execution_started": """ -async def hook(page, context, **kwargs): - # Hook called after custom JavaScript execution - print("[HOOK] on_execution_started - Custom JS executed!") - - # You could inject additional JavaScript here if needed - await page.evaluate("console.log('[INJECTED] Hook JS running');") - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - # Hook called before retrieving the HTML content - print("[HOOK] before_retrieve_html - Preparing to get HTML") - - # Scroll to bottom to trigger lazy loading - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + + # Scroll to middle + await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") await page.wait_for_timeout(500) - + # Scroll back to top - await page.evaluate("window.scrollTo(0, 0);") + await page.evaluate("window.scrollTo(0, 0)") await page.wait_for_timeout(500) - - # One more scroll to middle for good measure - await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2);") - - print("[HOOK] Scrolling completed for lazy-loaded content") + + print(" [Hook] βœ“ Lazy content loaded") return page -""", - - "before_return_html": """ -async def hook(page, context, html, **kwargs): - # Hook called before returning the HTML content - print(f"[HOOK] before_return_html - HTML length: {len(html)} characters") - - # Log some page metrics - metrics = await page.evaluate('''() => { - return { + + +async def page_analytics_hook(page, context, **kwargs): + """ + Analytics Hook: Log page metrics before extraction + """ + print(" [Hook] πŸ“Š Collecting page analytics...") + + metrics = await page.evaluate(''' + () => ({ + title: document.title, images: document.images.length, links: document.links.length, - scripts: document.scripts.length - } - }''') - - print(f"[HOOK] 
Page metrics - Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}") - + scripts: document.scripts.length, + headings: document.querySelectorAll('h1, h2, h3').length, + paragraphs: document.querySelectorAll('p').length + }) + ''') + + print(f" [Hook] πŸ“ˆ Page: {metrics['title'][:50]}...") + print(f" Links: {metrics['links']}, Images: {metrics['images']}, " + f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}") + + return page + + +# ============================================================================ +# APPROACH 1: String-Based Hooks (REST API) +# ============================================================================ + +def example_1_string_based_hooks(): + """ + Demonstrate string-based hooks with REST API + Use this when working with REST API directly or non-Python clients + """ + print_section( + "APPROACH 1: String-Based Hooks (REST API)", + "Define hooks as strings for REST API requests" + ) + + # Define hooks as strings + hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Setting up page context...") + # Block images for performance + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f" [String Hook] Navigating to {url[:50]}...") + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'string-based-hooks', + }) + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Scrolling page...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) return page """ } - - # Create request payload + + # Prepare request payload payload = { - "urls": ["https://httpbin.org/html"], + "urls": [TEST_URLS[2]], # httpbin.org "hooks": { - "code": hooks_code, + "code": hooks_config, "timeout": 30 }, "crawler_config": { - "js_code": "window.scrollTo(0, document.body.scrollHeight);", - "wait_for": "body", "cache_mode": "bypass" } } - - print("\nSending request with all 8 hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("\nβœ… Request successful!") - - # Check hooks execution - if 'hooks' in data: - hooks_info = data['hooks'] - print("\nπŸ“Š Hooks Execution Summary:") - print(f" Status: {hooks_info['status']['status']}") - print(f" Attached hooks: {len(hooks_info['status']['attached_hooks'])}") - - for hook_name in hooks_info['status']['attached_hooks']: - print(f" βœ“ {hook_name}") - - if 'summary' in hooks_info: - summary = hooks_info['summary'] - print(f"\nπŸ“ˆ Execution Statistics:") - print(f" Total executions: {summary['total_executions']}") - print(f" Successful: {summary['successful']}") - print(f" Failed: {summary['failed']}") - print(f" Timed out: {summary['timed_out']}") - print(f" Success rate: {summary['success_rate']:.1f}%") - - if hooks_info.get('execution_log'): - print(f"\nπŸ“ Execution Log:") - for log_entry in hooks_info['execution_log']: - status_icon = "βœ…" if log_entry['status'] == 'success' else "❌" - exec_time = log_entry.get('execution_time', 0) - print(f" {status_icon} {log_entry['hook_point']}: {exec_time:.3f}s") - - # 
Check crawl results - if 'results' in data and len(data['results']) > 0: - print(f"\nπŸ“„ Crawl Results:") - for result in data['results']: - print(f" URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - if result.get('html'): - print(f" HTML length: {len(result['html'])} characters") - - else: - print(f"❌ Error: {response.status_code}") - try: - error_data = response.json() - print(f"Error details: {json.dumps(error_data, indent=2)}") - except: - print(f"Error text: {response.text[:500]}") + + print(f"🎯 Target URL: {TEST_URLS[2]}") + print(f"πŸ”§ Configured {len(hooks_config)} string-based hooks") + print(f"πŸ“‘ Sending request to Docker API...\n") + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + + print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") + + # Display results + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + html_length = len(crawl_result.get('html', '')) + markdown_length = len(crawl_result.get('markdown', '')) + + print(f"\nπŸ“Š Results:") + print(f" β€’ HTML length: {html_length:,} characters") + print(f" β€’ Markdown length: {markdown_length:,} characters") + print(f" β€’ URL: {crawl_result.get('url')}") + + # Check hooks execution + if 'hooks' in result: + hooks_info = result['hooks'] + print(f"\n🎣 Hooks Execution:") + print(f" β€’ Status: {hooks_info['status']['status']}") + print(f" β€’ Attached hooks: {len(hooks_info['status']['attached_hooks'])}") + + if 'summary' in hooks_info: + summary = hooks_info['summary'] + print(f" β€’ Total executions: {summary['total_executions']}") + print(f" β€’ Successful: {summary['successful']}") + print(f" β€’ Success rate: {summary['success_rate']:.1f}%") + else: + print(f"⚠️ Crawl completed but no results") + + else: + print(f"❌ Request failed with status {response.status_code}") + print(f" Error: {response.text[:200]}") + + except requests.exceptions.Timeout: + print("⏰ Request timed out after 60 seconds") + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\n" + "─" * 70) + print("βœ“ String-based hooks example complete\n") -def test_authentication_flow(): - """Test a complete authentication flow with multiple hooks""" - print("\n" + "=" * 70) - print("Testing: Authentication Flow with Multiple Hooks") - print("=" * 70) - +# ============================================================================ +# APPROACH 2: Function-Based Hooks with hooks_to_string() Utility +# ============================================================================ + +def example_2_hooks_to_string_utility(): + """ + Demonstrate the hooks_to_string() utility for converting functions + Use this when you want to write hooks as functions but use REST API + """ + print_section( + "APPROACH 2: hooks_to_string() Utility", + "Convert Python functions to strings for REST API" + ) + + print("πŸ“¦ Creating hook functions...") + print(" β€’ performance_optimization_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + + # Convert function objects to strings using the utility + print("\nπŸ”„ Converting functions to strings with hooks_to_string()...") + + hooks_dict = { + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + } + + 
hooks_as_strings = hooks_to_string(hooks_dict) + + print(f"βœ… Successfully converted {len(hooks_as_strings)} functions to strings") + + # Show a preview + print("\nπŸ“ Sample converted hook (first 200 characters):") + print("─" * 70) + sample_hook = list(hooks_as_strings.values())[0] + print(sample_hook[:200] + "...") + print("─" * 70) + + # Use the converted hooks with REST API + print("\nπŸ“‘ Using converted hooks with REST API...") + + payload = { + "urls": [TEST_URLS[2]], + "hooks": { + "code": hooks_as_strings, + "timeout": 30 + } + } + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + print(f"\nβœ… Request successful! (took {execution_time:.2f}s)") + + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + print(f" β€’ HTML length: {len(crawl_result.get('html', '')):,} characters") + print(f" β€’ Hooks executed successfully!") + else: + print(f"❌ Request failed: {response.status_code}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\nπŸ’‘ Benefits of hooks_to_string():") + print(" βœ“ Write hooks as regular Python functions") + print(" βœ“ Full IDE support (autocomplete, syntax highlighting)") + print(" βœ“ Type checking and linting") + print(" βœ“ Easy to test and debug") + print(" βœ“ Reusable across projects") + print(" βœ“ Works with any REST API client") + + print("\n" + "─" * 70) + print("βœ“ hooks_to_string() utility example complete\n") + + +# ============================================================================ +# APPROACH 3: Docker Client with Automatic Conversion (RECOMMENDED) +# ============================================================================ + +async def example_3_docker_client_auto_conversion(): + """ + Demonstrate Docker Client with automatic hook conversion (RECOMMENDED) + Use this for the best developer experience with Python + """ + print_section( + "APPROACH 3: Docker Client with Auto-Conversion (RECOMMENDED)", + "Pass function objects directly - conversion happens automatically!" + ) + + print("🐳 Initializing Crawl4AI Docker Client...") + client = Crawl4aiDockerClient(base_url=DOCKER_URL) + + print("βœ… Client ready!\n") + + # Use our reusable hook library - just pass the function objects! + print("πŸ“š Using reusable hook library:") + print(" β€’ performance_optimization_hook") + print(" β€’ authentication_headers_hook") + print(" β€’ lazy_loading_handler_hook") + print(" β€’ page_analytics_hook") + + print("\n🎯 Target URL: " + TEST_URLS[0]) + print("πŸš€ Starting crawl with automatic hook conversion...\n") + + try: + start_time = time.time() + + # Pass function objects directly - NO manual conversion needed! ✨ + results = await client.crawl( + urls=[TEST_URLS[0]], + hooks={ + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + "before_return_html": page_analytics_hook, + }, + hooks_timeout=30 + ) + + execution_time = time.time() - start_time + + print(f"\nβœ… Crawl completed! 
(took {execution_time:.2f}s)\n") + + # Display results + if results and results.success: + result = results + print(f"πŸ“Š Results:") + print(f" β€’ URL: {result.url}") + print(f" β€’ Success: {result.success}") + print(f" β€’ HTML length: {len(result.html):,} characters") + print(f" β€’ Markdown length: {len(result.markdown):,} characters") + + # Show metadata + if result.metadata: + print(f"\nπŸ“‹ Metadata:") + print(f" β€’ Title: {result.metadata.get('title', 'N/A')[:50]}...") + + # Show links + if result.links: + internal_count = len(result.links.get('internal', [])) + external_count = len(result.links.get('external', [])) + print(f"\nπŸ”— Links Found:") + print(f" β€’ Internal: {internal_count}") + print(f" β€’ External: {external_count}") + else: + print(f"⚠️ Crawl completed but no successful results") + if results: + print(f" Error: {results.error_message}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + import traceback + traceback.print_exc() + + print("\n🌟 Why Docker Client is RECOMMENDED:") + print(" βœ“ Automatic function-to-string conversion") + print(" βœ“ No manual hooks_to_string() calls needed") + print(" βœ“ Cleaner, more Pythonic code") + print(" βœ“ Full type hints and IDE support") + print(" βœ“ Built-in error handling") + print(" βœ“ Async/await support") + + print("\n" + "─" * 70) + print("βœ“ Docker Client auto-conversion example complete\n") + + +# ============================================================================ +# APPROACH 4: Authentication Example +# ============================================================================ + +def example_4_authentication_flow(): + """ + Demonstrate authentication flow with multiple hooks + """ + print_section( + "EXAMPLE 4: Authentication Flow", + "Using hooks for authentication with cookies and headers" + ) + hooks_code = { "on_page_context_created": """ async def hook(page, context, **kwargs): print("[HOOK] Setting up authentication context") - + # Add authentication cookies await context.add_cookies([ { @@ -241,50 +457,42 @@ async def hook(page, context, **kwargs): "secure": True } ]) - - # Set localStorage items (for SPA authentication) - await page.evaluate(''' - localStorage.setItem('user_id', '12345'); - localStorage.setItem('auth_time', new Date().toISOString()); - ''') - + return page """, - + "before_goto": """ async def hook(page, context, url, **kwargs): print(f"[HOOK] Adding auth headers for {url}") - + # Add Authorization header import base64 credentials = base64.b64encode(b"user:passwd").decode('ascii') - + await page.set_extra_http_headers({ 'Authorization': f'Basic {credentials}', 'X-API-Key': 'test-api-key-123' }) - + return page """ } - + payload = { - "urls": [ - "https://httpbin.org/basic-auth/user/passwd" - ], + "urls": ["https://httpbin.org/basic-auth/user/passwd"], "hooks": { "code": hooks_code, "timeout": 15 } } - + print("\nTesting authentication with httpbin endpoints...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - + response = requests.post(f"{DOCKER_URL}/crawl", json=payload) + if response.status_code == 200: data = response.json() print("βœ… Authentication test completed") - + if 'results' in data: for i, result in enumerate(data['results']): print(f"\n URL {i+1}: {result['url']}") @@ -300,214 +508,120 @@ async def hook(page, context, url, **kwargs): else: print(f"❌ Error: {response.status_code}") + print("\n" + "─" * 70) + print("βœ“ Authentication example complete\n") -def test_performance_optimization_hooks(): - """Test hooks for performance 
optimization""" + +# ============================================================================ +# MAIN EXECUTION +# ============================================================================ + +async def main(): + """ + Run all example demonstrations + """ print("\n" + "=" * 70) - print("Testing: Performance Optimization Hooks") + print(" πŸš€ Crawl4AI - Docker Hooks System Examples") print("=" * 70) - - hooks_code = { - "on_page_context_created": """ -async def hook(page, context, **kwargs): - print("[HOOK] Optimizing page for performance") - - # Block resource-heavy content - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg,ico}", lambda route: route.abort()) - await context.route("**/*.{woff,woff2,ttf,otf}", lambda route: route.abort()) - await context.route("**/*.{mp4,webm,ogg,mp3,wav}", lambda route: route.abort()) - await context.route("**/googletagmanager.com/*", lambda route: route.abort()) - await context.route("**/google-analytics.com/*", lambda route: route.abort()) - await context.route("**/doubleclick.net/*", lambda route: route.abort()) - await context.route("**/facebook.com/*", lambda route: route.abort()) - - # Disable animations and transitions - await page.add_style_tag(content=''' - *, *::before, *::after { - animation-duration: 0s !important; - animation-delay: 0s !important; - transition-duration: 0s !important; - transition-delay: 0s !important; - } - ''') - - print("[HOOK] Performance optimizations applied") - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Removing unnecessary elements before extraction") - - # Remove ads, popups, and other unnecessary elements - await page.evaluate('''() => { - // Remove common ad containers - const adSelectors = [ - '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]', - '.popup', '.modal', '.overlay', '.cookie-banner', '.newsletter-signup' - ]; - - adSelectors.forEach(selector => { - document.querySelectorAll(selector).forEach(el => el.remove()); - }); - - // Remove script tags to clean up HTML - document.querySelectorAll('script').forEach(el => el.remove()); - - // Remove style tags we don't need - document.querySelectorAll('style').forEach(el => el.remove()); - }''') - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html"], - "hooks": { - "code": hooks_code, - "timeout": 10 - } - } - - print("\nTesting performance optimization hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("βœ… Performance optimization test completed") - - if 'results' in data and len(data['results']) > 0: - result = data['results'][0] - if result.get('html'): - print(f" HTML size: {len(result['html'])} characters") - print(" Resources blocked, ads removed, animations disabled") - else: - print(f"❌ Error: {response.status_code}") + # Check Docker service + print("\nπŸ” Checking Docker service status...") + if not check_docker_service(): + print("❌ Docker service is not running!") + print("\nπŸ“‹ To start the Docker service:") + print(" docker run -p 11235:11235 unclecode/crawl4ai:latest") + print("\nPlease start the service and run this example again.") + return -def test_content_extraction_hooks(): - """Test hooks for intelligent content extraction""" - print("\n" + "=" * 70) - print("Testing: Content Extraction Hooks") - 
print("=" * 70) - - hooks_code = { - "after_goto": """ -async def hook(page, context, url, response, **kwargs): - print(f"[HOOK] Waiting for dynamic content on {url}") - - # Wait for any lazy-loaded content - await page.wait_for_timeout(2000) - - # Trigger any "Load More" buttons - try: - load_more = await page.query_selector('[class*="load-more"], [class*="show-more"], button:has-text("Load More")') - if load_more: - await load_more.click() - await page.wait_for_timeout(1000) - print("[HOOK] Clicked 'Load More' button") - except: - pass - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Extracting structured data") - - # Extract metadata - metadata = await page.evaluate('''() => { - const getMeta = (name) => { - const element = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); - return element ? element.getAttribute('content') : null; - }; - - return { - title: document.title, - description: getMeta('description') || getMeta('og:description'), - author: getMeta('author'), - keywords: getMeta('keywords'), - ogTitle: getMeta('og:title'), - ogImage: getMeta('og:image'), - canonical: document.querySelector('link[rel="canonical"]')?.href, - jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]')) - .map(el => el.textContent).filter(Boolean) - }; - }''') - - print(f"[HOOK] Extracted metadata: {json.dumps(metadata, indent=2)}") - - # Infinite scroll handling - for i in range(3): - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") - await page.wait_for_timeout(1000) - print(f"[HOOK] Scroll iteration {i+1}/3") - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html", "https://httpbin.org/json"], - "hooks": { - "code": hooks_code, - "timeout": 20 - } - } - - print("\nTesting content extraction hooks...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - if response.status_code == 200: - data = response.json() - print("βœ… Content extraction test completed") - - if 'hooks' in data and 'summary' in data['hooks']: - summary = data['hooks']['summary'] - print(f" Hooks executed: {summary['successful']}/{summary['total_executions']}") - - if 'results' in data: - for result in data['results']: - print(f"\n URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - else: - print(f"❌ Error: {response.status_code}") + print("βœ… Docker service is running!\n") - -def main(): - """Run comprehensive hook tests""" - print("πŸ”§ Crawl4AI Docker API - Comprehensive Hooks Testing") - print("Based on docs/examples/hooks_example.py") - print("=" * 70) - - tests = [ - ("All Hooks Demo", test_all_hooks_demo), - ("Authentication Flow", test_authentication_flow), - ("Performance Optimization", test_performance_optimization_hooks), - ("Content Extraction", test_content_extraction_hooks), + # Run all examples + examples = [ + ("String-Based Hooks (REST API)", example_1_string_based_hooks, False), + ("hooks_to_string() Utility", example_2_hooks_to_string_utility, False), + ("Docker Client Auto-Conversion (Recommended)", example_3_docker_client_auto_conversion, True), + ("Authentication Flow", example_4_authentication_flow, False), ] - - for i, (name, test_func) in enumerate(tests, 1): - print(f"\nπŸ“Œ Test {i}/{len(tests)}: {name}") + + for i, (name, example_func, is_async) in enumerate(examples, 1): + print(f"\n{'πŸ”·' * 35}") + print(f"Example {i}/{len(examples)}: {name}") + print(f"{'πŸ”·' * 35}\n") + try: - test_func() - 
print(f"βœ… {name} completed") + if is_async: + await example_func() + else: + example_func() + + print(f"βœ… Example {i} completed successfully!") + + # Pause between examples (except the last one) + if i < len(examples): + print("\n⏸️ Press Enter to continue to next example...") + input() + + except KeyboardInterrupt: + print(f"\n⏹️ Examples interrupted by user") + break except Exception as e: - print(f"❌ {name} failed: {e}") + print(f"\n❌ Example {i} failed: {str(e)}") import traceback traceback.print_exc() - + print("\nContinuing to next example...\n") + continue + + # Final summary print("\n" + "=" * 70) - print("πŸŽ‰ All comprehensive hook tests completed!") + print(" πŸŽ‰ All Examples Complete!") print("=" * 70) + print("\nπŸ“Š Summary - Three Approaches to Docker Hooks:") + + print("\n✨ 1. String-Based Hooks:") + print(" β€’ Write hooks as strings directly in JSON") + print(" β€’ Best for: REST API, non-Python clients, simple use cases") + print(" β€’ Cons: No IDE support, harder to debug") + + print("\n✨ 2. hooks_to_string() Utility:") + print(" β€’ Write hooks as Python functions, convert to strings") + print(" β€’ Best for: Python with REST API, reusable hook libraries") + print(" β€’ Pros: IDE support, type checking, easy debugging") + + print("\n✨ 3. Docker Client (RECOMMENDED):") + print(" β€’ Pass function objects directly, automatic conversion") + print(" β€’ Best for: Python applications, best developer experience") + print(" β€’ Pros: All benefits of #2 + cleaner code, no manual conversion") + + print("\nπŸ’‘ Recommendation:") + print(" Use Docker Client (#3) for Python applications") + print(" Use hooks_to_string() (#2) when you need REST API flexibility") + print(" Use string-based (#1) for non-Python clients or simple scripts") + + print("\n🎯 8 Hook Points Available:") + print(" β€’ on_browser_created, on_page_context_created") + print(" β€’ on_user_agent_updated, before_goto, after_goto") + print(" β€’ on_execution_started, before_retrieve_html, before_return_html") + + print("\nπŸ“š Resources:") + print(" β€’ Docs: https://docs.crawl4ai.com/core/docker-deployment") + print(" β€’ GitHub: https://github.com/unclecode/crawl4ai") + print(" β€’ Discord: https://discord.gg/jP8KfhDhyN") + + print("\n" + "=" * 70) + print(" Happy Crawling! πŸ•·οΈ") + print("=" * 70 + "\n") + if __name__ == "__main__": - main() \ No newline at end of file + print("\n🎬 Starting Crawl4AI Docker Hooks Examples...") + print("Press Ctrl+C anytime to exit\n") + + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\nπŸ‘‹ Examples stopped by user. 
Thanks for exploring Crawl4AI!")
+    except Exception as e:
+        print(f"\n\n❌ Error: {str(e)}")
+        import traceback
+        traceback.print_exc()
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
index ec2c69a5..6d5a9eae 100644
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -27,6 +27,14 @@
   - [Hook Response Information](#hook-response-information)
   - [Error Handling](#error-handling)
   - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
+- [Job Queue & Webhook API](#job-queue-webhook-api)
+  - [Why Use the Job Queue API?](#why-use-the-job-queue-api)
+  - [Available Endpoints](#available-endpoints)
+  - [Webhook Configuration](#webhook-configuration)
+  - [Usage Examples](#usage-examples)
+  - [Webhook Best Practices](#webhook-best-practices)
+  - [Use Cases](#use-cases)
+  - [Troubleshooting](#troubleshooting)
 - [Dockerfile Parameters](#dockerfile-parameters)
 - [Using the API](#using-the-api)
   - [Playground Interface](#playground-interface)
@@ -1110,6 +1118,464 @@ if __name__ == "__main__":
 
 ---
 
+## Job Queue & Webhook API
+
+The Docker deployment includes an asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.
+
+### Why Use the Job Queue API?
+
+**Traditional Synchronous API (`/crawl`):**
+- Client waits for the entire crawl to complete
+- Timeout issues with long-running crawls
+- Resource blocking during execution
+- Constant polling required for status updates
+
+**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
+- βœ… Submit a job and continue immediately
+- βœ… No timeout concerns for long operations
+- βœ… Real-time webhook notifications on completion
+- βœ… Better resource utilization
+- βœ… Perfect for batch processing
+- βœ… Ideal for microservice architectures
+
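+A minimal end-to-end exchange looks like this (a sketch built on the endpoints documented below; `https://your-app.com/...` is a placeholder for your own receiver):
+
+```python
+import requests
+
+# Submit a crawl job; the server returns a task_id immediately
+resp = requests.post(
+    "http://localhost:11235/crawl/job",
+    json={
+        "urls": ["https://example.com"],
+        "webhook_config": {"webhook_url": "https://your-app.com/webhook/crawl-complete"},
+    },
+)
+task_id = resp.json()["task_id"]   # e.g. "crawl_1698765432"
+
+# Your webhook is notified on completion; you can also poll:
+status = requests.get(f"http://localhost:11235/job/{task_id}").json()
+print(status["status"])            # "processing", "completed", or "failed"
+```
+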
+### Available Endpoints
+
+#### 1. Crawl Job Endpoint
+
+```
+POST /crawl/job
+```
+
+Submit an asynchronous crawl job with an optional webhook notification.
+
+**Request Body:**
+```json
+{
+  "urls": ["https://example.com"],
+  "cache_mode": "bypass",
+  "extraction_strategy": {
+    "type": "JsonCssExtractionStrategy",
+    "schema": {
+      "title": "h1",
+      "content": ".article-body"
+    }
+  },
+  "webhook_config": {
+    "webhook_url": "https://your-app.com/webhook/crawl-complete",
+    "webhook_data_in_payload": true,
+    "webhook_headers": {
+      "X-Webhook-Secret": "your-secret-token",
+      "X-Custom-Header": "value"
+    }
+  }
+}
+```
+
+**Response:**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "message": "Crawl job submitted"
+}
+```
+
+#### 2. LLM Extraction Job Endpoint
+
+```
+POST /llm/job
+```
+
+Submit an asynchronous LLM extraction job with an optional webhook notification.
+
+**Request Body:**
+```json
+{
+  "url": "https://example.com/article",
+  "q": "Extract the article title, author, publication date, and main points",
+  "provider": "openai/gpt-4o-mini",
+  "schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
+  "cache": false,
+  "webhook_config": {
+    "webhook_url": "https://your-app.com/webhook/llm-complete",
+    "webhook_data_in_payload": true,
+    "webhook_headers": {
+      "X-Webhook-Secret": "your-secret-token"
+    }
+  }
+}
+```
+
+**Response:**
+```json
+{
+  "task_id": "llm_1698765432",
+  "message": "LLM job submitted"
+}
+```
+
+#### 3. Job Status Endpoint
+
+```
+GET /job/{task_id}
+```
+
+Check the status of a submitted job and retrieve its results.
+
+**Response (In Progress):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "status": "processing",
+  "message": "Job is being processed"
+}
+```
+
+**Response (Completed):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "status": "completed",
+  "result": {
+    "markdown": "# Page Title\n\nContent...",
+    "extracted_content": {...},
+    "links": {...}
+  }
+}
+```
+
+### Webhook Configuration
+
+Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
+
+#### Webhook Config Parameters
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
+| `webhook_data_in_payload` | boolean | No | Include full result data in the webhook payload (default: false) |
+| `webhook_headers` | object | No | Custom headers for authentication/identification |
+
+#### Webhook Payload Format
+
+**Success Notification (Crawl Job):**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "task_type": "crawl",
+  "status": "completed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "data": {
+    "markdown": "# Page content...",
+    "extracted_content": {...},
+    "links": {...}
+  }
+}
+```
+
+**Success Notification (LLM Job):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22",
+      "points": ["Point 1", "Point 2"]
+    }
+  }
+}
+```
+
+**Failure Notification:**
+```json
+{
+  "task_id": "crawl_1698765432",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T12:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30 seconds"
+}
+```
+
+#### Webhook Delivery & Retry
+
+- **Delivery Method:** HTTP POST to your `webhook_url`
+- **Content-Type:** `application/json`
+- **Retry Policy:** Exponential backoff with 5 attempts
+  - Attempt 1: Immediate
+  - Attempt 2: 1 second delay
+  - Attempt 3: 2 seconds delay
+  - Attempt 4: 4 seconds delay
+  - Attempt 5: 8 seconds delay
+- **Success Status Codes:** 200-299
+- **Custom Headers:** Your `webhook_headers` are included in every request
+
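+Conceptually, each delivery follows the retry policy above, as in this sketch (illustrative only, not the server's actual code; `payload` and `headers` are whatever was configured for the job):
+
+```python
+import time
+import requests
+
+def deliver_with_backoff(webhook_url, payload, headers, attempts=5):
+    """Sketch of the documented policy: try immediately, then wait 1s, 2s, 4s, 8s."""
+    delay = 1
+    for attempt in range(attempts):
+        try:
+            resp = requests.post(webhook_url, json=payload, headers=headers, timeout=10)
+            if 200 <= resp.status_code < 300:
+                return True       # delivery accepted
+        except requests.RequestException:
+            pass                  # network failure counts as a failed attempt
+        if attempt < attempts - 1:
+            time.sleep(delay)     # exponential backoff
+            delay *= 2
+    return False
+```
+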
"JsonCssExtractionStrategy", + "schema": { + "name": "Example Schema", + "baseSelector": "body", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"} + ] + } + }, + "webhook_config": { + "webhook_url": "https://your-app.com/webhook/crawl-complete", + "webhook_data_in_payload": True, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + } + } + ) + + task_id = response.json()['task_id'] + print(f"Job submitted: {task_id}") + return task_id + +if __name__ == '__main__': + app.run(port=5000) +``` + +#### Example 2: LLM Extraction with Webhooks + +```python +import requests + +def submit_llm_job_with_webhook(): + response = requests.post( + "http://localhost:11235/llm/job", + json={ + "url": "https://example.com/article", + "q": "Extract the article title, author, and main points", + "provider": "openai/gpt-4o-mini", + "webhook_config": { + "webhook_url": "https://your-app.com/webhook/llm-complete", + "webhook_data_in_payload": True, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + } + } + ) + + task_id = response.json()['task_id'] + print(f"LLM job submitted: {task_id}") + return task_id + +# Webhook handler for LLM jobs +@app.route('/webhook/llm-complete', methods=['POST']) +def handle_llm_webhook(): + payload = request.json + + if payload['status'] == 'completed': + extracted = payload['data']['extracted_content'] + print(f"βœ… LLM extraction completed!") + print(f"Results: {extracted}") + else: + print(f"❌ LLM extraction failed: {payload.get('error')}") + + return jsonify({"status": "received"}), 200 +``` + +#### Example 3: Without Webhooks (Polling) + +If you don't use webhooks, you can poll for results: + +```python +import requests +import time + +# Submit job +response = requests.post( + "http://localhost:11235/crawl/job", + json={"urls": ["https://example.com"]} +) +task_id = response.json()['task_id'] + +# Poll for results +while True: + result = requests.get(f"http://localhost:11235/job/{task_id}") + data = result.json() + + if data['status'] == 'completed': + print("Job completed!") + print(data['result']) + break + elif data['status'] == 'failed': + print(f"Job failed: {data.get('error')}") + break + + print("Still processing...") + time.sleep(2) +``` + +#### Example 4: Global Webhook Configuration + +Set a default webhook URL in your `config.yml` to avoid repeating it in every request: + +```yaml +# config.yml +api: + crawler: + # ... other settings ... + webhook: + default_url: "https://your-app.com/webhook/default" + default_headers: + X-Webhook-Secret: "your-secret-token" +``` + +Then submit jobs without webhook config: + +```python +# Uses the global webhook configuration +response = requests.post( + "http://localhost:11235/crawl/job", + json={"urls": ["https://example.com"]} +) +``` + +### Webhook Best Practices + +1. **Authentication:** Always use custom headers for webhook authentication + ```json + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + ``` + +2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications) + +3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed + ```python + @app.route('/webhook', methods=['POST']) + def webhook(): + payload = request.json + # Queue for background processing + queue.enqueue(process_webhook, payload) + return jsonify({"status": "received"}), 200 + ``` + +4. 
+### Use Cases
+
+**1. Batch Processing**
+Submit hundreds of URLs and get notified as each completes:
+```python
+urls = ["https://site1.com", "https://site2.com", ...]
+for url in urls:
+    submit_crawl_job(url, webhook_url="https://app.com/webhook")
+```
+
+**2. Microservice Integration**
+Integrate with event-driven architectures:
+```python
+# Service A submits job
+task_id = submit_crawl_job(url)
+
+# Service B receives webhook and triggers next step
+@app.route('/webhook')
+def webhook():
+    process_result(request.json)
+    trigger_next_service()
+    return "OK", 200
+```
+
+**3. Long-Running Extractions**
+Handle complex LLM extractions without timeouts:
+```python
+submit_llm_job(
+    url="https://long-article.com",
+    q="Comprehensive summary with key points and analysis",
+    webhook_url="https://app.com/webhook/llm"
+)
+```
+
+### Troubleshooting
+
+**Webhook not receiving notifications?**
+- Check that your webhook URL is publicly accessible
+- Verify firewall/security group settings
+- Use webhook testing tools like webhook.site for debugging
+- Check server logs for delivery attempts
+- Ensure your handler returns a 200-299 status code
+
+**Job stuck in processing?**
+- Check the Redis connection: `docker logs <container_name> | grep -i redis`
+- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
+- Check server logs: `docker logs <container_name>`
+
+**Need to cancel a job?**
+Jobs are processed asynchronously. If you need to cancel:
+- Delete the task from Redis (requires Redis CLI access)
+- Or implement a cancellation endpoint in your webhook handler
+
+---
+
 ## Dockerfile Parameters
 
 You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.