#!/usr/bin/env python3
"""
Comprehensive hooks examples using Docker Client with function objects.

This approach is recommended because:
- Write hooks as regular Python functions
- Full IDE support (autocomplete, type checking)
- Automatic conversion to API format
- Reusable and testable code
- Clean, readable syntax

NOTE(review): the client converts the hook functions below to an API format
before sending them to the server, so each hook is presumably required to be
self-contained (no references to module-level helpers or imports). That is why
imports used by a hook live inside the hook body -- confirm before refactoring.
"""

import asyncio

from crawl4ai import Crawl4aiDockerClient

# Alternative endpoint kept for convenience:
# API_BASE_URL = "http://localhost:11235"
API_BASE_URL = "http://localhost:11234"


# ============================================================================
# Hook Function Definitions
# ============================================================================

# --- All Hooks Demo ---

async def browser_created_hook(browser, **kwargs):
    """Called after browser is created.

    Only logs a message and hands the browser object back unchanged.
    """
    print("[HOOK] Browser created and ready")
    return browser


async def page_context_hook(page, context, **kwargs):
    """Setup page environment.

    Sets a 1920x1080 viewport, adds a test session cookie for httpbin.org,
    and aborts requests matching common image extensions and analytics paths.
    Returns the page.
    """
    print("[HOOK] Setting up page environment")

    # Set viewport
    await page.set_viewport_size({"width": 1920, "height": 1080})

    # Add cookies
    await context.add_cookies([{
        "name": "test_session",
        "value": "abc123xyz",
        "domain": ".httpbin.org",
        "path": "/"
    }])

    # Block resources
    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
    await context.route("**/analytics/*", lambda route: route.abort())

    print("[HOOK] Environment configured")
    return page


async def user_agent_hook(page, context, user_agent, **kwargs):
    """Called when user agent is updated.

    Logs the first 50 characters of the user agent and returns the page.
    """
    print(f"[HOOK] User agent: {user_agent[:50]}...")
    return page


async def before_goto_hook(page, context, url, **kwargs):
    """Called before navigating to URL.

    Attaches two extra HTTP headers to the page and returns it.
    """
    print(f"[HOOK] Navigating to: {url}")

    await page.set_extra_http_headers({
        "X-Custom-Header": "crawl4ai-test",
        "Accept-Language": "en-US"
    })

    return page


async def after_goto_hook(page, context, url, response, **kwargs):
    """Called after page loads.

    Waits one second, then waits up to two seconds for the ``body`` element.
    A failure while waiting is logged and ignored so the crawl continues.
    Returns the page.
    """
    print(f"[HOOK] Page loaded: {url}")

    await page.wait_for_timeout(1000)

    try:
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element ready")
    except Exception:
        # `Exception` (not a bare except) so task cancellation and
        # KeyboardInterrupt still propagate instead of being swallowed.
        print("[HOOK] Timeout, continuing")

    return page


async def execution_started_hook(page, context, **kwargs):
    """Called when custom JS execution starts.

    Emits a marker line to the page's JS console and returns the page.
    """
    print("[HOOK] JS execution started")
    await page.evaluate("console.log('[HOOK] Custom JS');")
    return page


async def before_retrieve_hook(page, context, **kwargs):
    """Called before retrieving HTML.

    Scrolls to the bottom of the document, pauses half a second, then scrolls
    back to the top. Returns the page.
    """
    print("[HOOK] Preparing HTML retrieval")

    # Scroll for lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
    await page.wait_for_timeout(500)
    await page.evaluate("window.scrollTo(0, 0);")

    print("[HOOK] Scrolling complete")
    return page


async def before_return_hook(page, context, html, **kwargs):
    """Called before returning HTML.

    Logs the HTML length plus image/link counts read from the live document.
    Returns the page.
    """
    print(f"[HOOK] HTML ready: {len(html)} chars")

    metrics = await page.evaluate('''() => ({
        images: document.images.length,
        links: document.links.length,
        scripts: document.scripts.length
    })''')

    print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
    return page


# --- Authentication Hooks ---

async def auth_context_hook(page, context, **kwargs):
    """Setup authentication context.

    Adds an httpOnly ``auth_token`` cookie for httpbin.org and writes
    ``user_id`` / ``auth_time`` entries into localStorage. Returns the page.
    """
    print("[HOOK] Setting up authentication")

    # Add auth cookies
    await context.add_cookies([{
        "name": "auth_token",
        "value": "fake_jwt_token",
        "domain": ".httpbin.org",
        "path": "/",
        "httpOnly": True
    }])

    # Set localStorage
    await page.evaluate('''
        localStorage.setItem('user_id', '12345');
        localStorage.setItem('auth_time', new Date().toISOString());
    ''')

    print("[HOOK] Auth context ready")
    return page


async def auth_headers_hook(page, context, url, **kwargs):
    """Add authentication headers.

    Sets a Basic ``Authorization`` header for ``user:passwd`` and a test
    ``X-API-Key`` header on the page. Returns the page.
    """
    print(f"[HOOK] Adding auth headers for {url}")

    # Imported locally on purpose: see the module docstring note about hooks
    # being converted to API format.
    import base64
    credentials = base64.b64encode(b"user:passwd").decode('ascii')

    await page.set_extra_http_headers({
        'Authorization': f'Basic {credentials}',
        'X-API-Key': 'test-key-123'
    })

    return page


# --- Performance Optimization Hooks ---

async def performance_hook(page, context, **kwargs):
    """Optimize page for performance.

    Aborts requests for images, fonts, media and a few tracker domains, then
    injects a style tag that zeroes animation and transition durations.
    Returns the page.
    """
    print("[HOOK] Optimizing for performance")

    # Block resource-heavy content
    await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort())
    await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort())
    await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort())
    await context.route("**/googletagmanager.com/*", lambda r: r.abort())
    await context.route("**/google-analytics.com/*", lambda r: r.abort())
    await context.route("**/facebook.com/*", lambda r: r.abort())

    # Disable animations
    await page.add_style_tag(content='''
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    ''')

    print("[HOOK] Optimizations applied")
    return page


async def cleanup_hook(page, context, **kwargs):
    """Clean page before extraction.

    Removes elements matching a fixed list of ad/popup/banner selectors and
    all ``script`` / ``style`` elements from the document. Returns the page.
    """
    print("[HOOK] Cleaning page")

    await page.evaluate('''() => {
        const selectors = [
            '.ad', '.ads', '.advertisement',
            '.popup', '.modal', '.overlay',
            '.cookie-banner', '.newsletter'
        ];

        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => el.remove());
        });

        document.querySelectorAll('script, style').forEach(el => el.remove());
    }''')

    print("[HOOK] Page cleaned")
    return page


# --- Content Extraction Hooks ---

async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
    """Wait for dynamic content to load.

    Waits two seconds, then clicks a "Load More" control if one is found.
    The click step is best-effort: any error is reported and ignored.
    Returns the page.
    """
    print(f"[HOOK] Waiting for dynamic content on {url}")

    await page.wait_for_timeout(2000)

    # Click "Load More" if exists
    try:
        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
        if load_more:
            await load_more.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More'")
    except Exception as exc:
        # Best-effort step: report instead of silently swallowing, and let
        # cancellation / KeyboardInterrupt propagate.
        print(f"[HOOK] 'Load More' step skipped: {exc}")

    return page


async def extract_metadata_hook(page, context, **kwargs):
    """Extract page metadata.

    Logs title/description/author/keywords read from the document, then
    scrolls to the bottom three times with a one-second pause after each
    scroll. Returns the page.
    """
    print("[HOOK] Extracting metadata")

    metadata = await page.evaluate('''() => {
        const getMeta = (name) => {
            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return el ? el.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
        };
    }''')

    print(f"[HOOK] Metadata: {metadata}")

    # Infinite scroll
    for i in range(3):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll {i+1}/3")

    return page


# --- Multi-URL Hooks ---

async def url_specific_hook(page, context, url, **kwargs):
    """Apply URL-specific logic.

    Sets an ``X-Type`` header of ``HTML`` or ``JSON`` when the URL contains
    the substring ``html`` or ``json`` respectively. Returns the page.
    """
    print(f"[HOOK] Processing URL: {url}")

    # URL-specific headers
    if 'html' in url:
        await page.set_extra_http_headers({"X-Type": "HTML"})
    elif 'json' in url:
        await page.set_extra_http_headers({"X-Type": "JSON"})

    return page


async def track_progress_hook(page, context, url, response, **kwargs):
    """Track crawl progress.

    Logs the URL with the response status, or ``unknown`` when no response
    object was provided. Returns the page.
    """
    status = response.status if response else 'unknown'
    print(f"[HOOK] Loaded {url} - Status: {status}")
    return page


# ============================================================================
# Test Functions
# ============================================================================

async def test_all_hooks_comprehensive() -> None:
    """Test all 8 hook types.

    Crawls a single httpbin.org page with every hook registered and prints
    the resulting URL, success flag and HTML length.
    """
    print("=" * 70)
    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling with all 8 hooks...")

        # Define hooks with function objects
        hooks = {
            "on_browser_created": browser_created_hook,
            "on_page_context_created": page_context_hook,
            "on_user_agent_updated": user_agent_hook,
            "before_goto": before_goto_hook,
            "after_goto": after_goto_hook,
            "on_execution_started": execution_started_hook,
            "before_retrieve_html": before_retrieve_hook,
            "before_return_html": before_return_hook
        }

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks,
            hooks_timeout=30
        )

        print("\n✅ Success!")
        print(f"   URL: {result.url}")
        print(f"   Success: {result.success}")
        print(f"   HTML: {len(result.html)} chars")


async def test_authentication_workflow() -> None:
    """Test authentication with hooks.

    Crawls httpbin.org's basic-auth endpoint with the auth hooks and reports
    whether the returned HTML looks like an authenticated response.
    """
    print("\n" + "=" * 70)
    print("Test 2: Authentication Workflow (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting authentication...")

        hooks = {
            "on_page_context_created": auth_context_hook,
            "before_goto": auth_headers_hook
        }

        result = await client.crawl(
            ["https://httpbin.org/basic-auth/user/passwd"],
            hooks=hooks,
            hooks_timeout=15
        )

        print("\n✅ Authentication completed")

        if result.success:
            if '"authenticated"' in result.html and 'true' in result.html:
                print("   ✅ Basic auth successful!")
            else:
                print("   ⚠️ Auth status unclear")
        else:
            print(f"   ❌ Failed: {result.error_message}")


async def test_performance_optimization() -> None:
    """Test performance optimization.

    Crawls a single page with the resource-blocking and cleanup hooks and
    prints the resulting HTML size.
    """
    print("\n" + "=" * 70)
    print("Test 3: Performance Optimization (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting performance hooks...")

        hooks = {
            "on_page_context_created": performance_hook,
            "before_retrieve_html": cleanup_hook
        }

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks,
            hooks_timeout=10
        )

        print("\n✅ Optimization completed")
        print(f"   HTML size: {len(result.html):,} chars")
        print("   Resources blocked, ads removed")


async def test_content_extraction() -> None:
    """Test content extraction.

    Crawls a page with the dynamic-content and metadata hooks and prints the
    URL, success flag and metadata of the result.
    """
    print("\n" + "=" * 70)
    print("Test 4: Content Extraction (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting extraction hooks...")

        hooks = {
            "after_goto": wait_dynamic_content_hook,
            "before_retrieve_html": extract_metadata_hook
        }

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20
        )

        print("\n✅ Extraction completed")
        print(f"   URL: {result.url}")
        print(f"   Success: {result.success}")
        print(f"   Metadata: {result.metadata}")


async def test_multi_url_crawl() -> None:
    """Test hooks with multiple URLs.

    Crawls three httpbin.org URLs in one call and prints a per-URL status
    line for each returned result.
    """
    print("\n" + "=" * 70)
    print("Test 5: Multi-URL Crawl (Docker Client)")
    print("=" * 70)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling multiple URLs...")

        hooks = {
            "before_goto": url_specific_hook,
            "after_goto": track_progress_hook
        }

        results = await client.crawl(
            [
                "https://httpbin.org/html",
                "https://httpbin.org/json",
                "https://httpbin.org/xml"
            ],
            hooks=hooks,
            hooks_timeout=15
        )

        print("\n✅ Multi-URL crawl completed")
        print(f"\n   Crawled {len(results)} URLs:")
        for i, result in enumerate(results, 1):
            status = "✅" if result.success else "❌"
            print(f"   {status} {i}. {result.url}")


async def test_reusable_hook_library() -> None:
    """Test using reusable hook library.

    Groups hooks as static methods on a local class and registers two of
    them for a single-page crawl.
    """
    print("\n" + "=" * 70)
    print("Test 6: Reusable Hook Library (Docker Client)")
    print("=" * 70)

    # Create a library of reusable hooks
    class HookLibrary:
        @staticmethod
        async def block_images(page, context, **kwargs):
            """Block all images"""
            await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
            print("[LIBRARY] Images blocked")
            return page

        @staticmethod
        async def block_analytics(page, context, **kwargs):
            """Block analytics"""
            await context.route("**/analytics/*", lambda r: r.abort())
            await context.route("**/google-analytics.com/*", lambda r: r.abort())
            print("[LIBRARY] Analytics blocked")
            return page

        @staticmethod
        async def scroll_infinite(page, context, **kwargs):
            """Handle infinite scroll"""
            # Up to five rounds; stop early once scrolling no longer grows
            # the document height.
            for i in range(5):
                prev = await page.evaluate("document.body.scrollHeight")
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                await page.wait_for_timeout(1000)
                curr = await page.evaluate("document.body.scrollHeight")
                if curr == prev:
                    break
            print("[LIBRARY] Infinite scroll complete")
            return page

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nUsing hook library...")

        hooks = {
            "on_page_context_created": HookLibrary.block_images,
            "before_retrieve_html": HookLibrary.scroll_infinite
        }

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20
        )

        print("\n✅ Library hooks completed")
        print(f"   Success: {result.success}")


# ============================================================================
# Main
# ============================================================================

async def main() -> None:
    """Run all Docker client hook examples.

    Each example runs in order; a failure in one is printed with its
    traceback and does not stop the remaining examples.
    """
    import traceback

    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
    print("Using Python function objects with automatic conversion")
    print("=" * 70)

    tests = [
        ("All Hooks Demo", test_all_hooks_comprehensive),
        ("Authentication", test_authentication_workflow),
        ("Performance", test_performance_optimization),
        ("Extraction", test_content_extraction),
        ("Multi-URL", test_multi_url_crawl),
        ("Hook Library", test_reusable_hook_library)
    ]

    for i, (name, test_func) in enumerate(tests, 1):
        try:
            await test_func()
            print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
        except Exception as e:
            # Top-level boundary: report and continue with the next example.
            print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
            traceback.print_exc()

    print("=" * 70)
    print("🎉 All Docker client hook examples completed!")
    print("\n💡 Key Benefits of Function-Based Hooks:")
    print("   • Write as regular Python functions")
    print("   • Full IDE support (autocomplete, types)")
    print("   • Automatic conversion to API format")
    print("   • Reusable across projects")
    print("   • Clean, readable code")
    print("   • Easy to test and debug")
    print("=" * 70)


if __name__ == "__main__":
    asyncio.run(main())