feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377

Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py)
2025-10-13 12:34:08 +08:00
parent 216019f29a
commit a3f057e19f
6 changed files with 1198 additions and 44 deletions
--- a/docs/examples/docker_client_hooks_example.py
+++ b/docs/examples/docker_client_hooks_example.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python3
+"""
+Comprehensive hooks examples using Docker Client with function objects.
+
+This approach is recommended because:
+- Write hooks as regular Python functions
+- Full IDE support (autocomplete, type checking)
+- Automatic conversion to API format
+- Reusable and testable code
+- Clean, readable syntax
+"""
+
+import asyncio
+from crawl4ai import Crawl4aiDockerClient
+
+# API_BASE_URL = "http://localhost:11235"
+API_BASE_URL = "http://localhost:11234"
+
+
+# ============================================================================
+# Hook Function Definitions
+# ============================================================================
+
+# --- All Hooks Demo ---
+async def browser_created_hook(browser, **kwargs):
+    """Called after browser is created"""
+    print("[HOOK] Browser created and ready")
+    return browser
+
+
+async def page_context_hook(page, context, **kwargs):
+    """Setup page environment"""
+    print("[HOOK] Setting up page environment")
+
+    # Set viewport
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+
+    # Add cookies
+    await context.add_cookies([{
+        "name": "test_session",
+        "value": "abc123xyz",
+        "domain": ".httpbin.org",
+        "path": "/"
+    }])
+
+    # Block resources
+    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
+    await context.route("**/analytics/*", lambda route: route.abort())
+
+    print("[HOOK] Environment configured")
+    return page
+
+
+async def user_agent_hook(page, context, user_agent, **kwargs):
+    """Called when user agent is updated"""
+    print(f"[HOOK] User agent: {user_agent[:50]}...")
+    return page
+
+
+async def before_goto_hook(page, context, url, **kwargs):
+    """Called before navigating to URL"""
+    print(f"[HOOK] Navigating to: {url}")
+
+    await page.set_extra_http_headers({
+        "X-Custom-Header": "crawl4ai-test",
+        "Accept-Language": "en-US"
+    })
+
+    return page
+
+
+async def after_goto_hook(page, context, url, response, **kwargs):
+    """Called after page loads"""
+    print(f"[HOOK] Page loaded: {url}")
+
+    await page.wait_for_timeout(1000)
+
+    try:
+        await page.wait_for_selector("body", timeout=2000)
+        print("[HOOK] Body element ready")
+    except:
+        print("[HOOK] Timeout, continuing")
+
+    return page
+
+
+async def execution_started_hook(page, context, **kwargs):
+    """Called when custom JS execution starts"""
+    print("[HOOK] JS execution started")
+    await page.evaluate("console.log('[HOOK] Custom JS');")
+    return page
+
+
+async def before_retrieve_hook(page, context, **kwargs):
+    """Called before retrieving HTML"""
+    print("[HOOK] Preparing HTML retrieval")
+
+    # Scroll for lazy content
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+    await page.wait_for_timeout(500)
+    await page.evaluate("window.scrollTo(0, 0);")
+
+    print("[HOOK] Scrolling complete")
+    return page
+
+
+async def before_return_hook(page, context, html, **kwargs):
+    """Called before returning HTML"""
+    print(f"[HOOK] HTML ready: {len(html)} chars")
+
+    metrics = await page.evaluate('''() => ({
+        images: document.images.length,
+        links: document.links.length,
+        scripts: document.scripts.length
+    })''')
+
+    print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
+    return page
+
+
+# --- Authentication Hooks ---
+async def auth_context_hook(page, context, **kwargs):
+    """Setup authentication context"""
+    print("[HOOK] Setting up authentication")
+
+    # Add auth cookies
+    await context.add_cookies([{
+        "name": "auth_token",
+        "value": "fake_jwt_token",
+        "domain": ".httpbin.org",
+        "path": "/",
+        "httpOnly": True
+    }])
+
+    # Set localStorage
+    await page.evaluate('''
+        localStorage.setItem('user_id', '12345');
+        localStorage.setItem('auth_time', new Date().toISOString());
+    ''')
+
+    print("[HOOK] Auth context ready")
+    return page
+
+
+async def auth_headers_hook(page, context, url, **kwargs):
+    """Add authentication headers"""
+    print(f"[HOOK] Adding auth headers for {url}")
+
+    import base64
+    credentials = base64.b64encode(b"user:passwd").decode('ascii')
+
+    await page.set_extra_http_headers({
+        'Authorization': f'Basic {credentials}',
+        'X-API-Key': 'test-key-123'
+    })
+
+    return page
+
+
+# --- Performance Optimization Hooks ---
+async def performance_hook(page, context, **kwargs):
+    """Optimize page for performance"""
+    print("[HOOK] Optimizing for performance")
+
+    # Block resource-heavy content
+    await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort())
+    await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort())
+    await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort())
+    await context.route("**/googletagmanager.com/*", lambda r: r.abort())
+    await context.route("**/google-analytics.com/*", lambda r: r.abort())
+    await context.route("**/facebook.com/*", lambda r: r.abort())
+
+    # Disable animations
+    await page.add_style_tag(content='''
+        *, *::before, *::after {
+            animation-duration: 0s !important;
+            transition-duration: 0s !important;
+        }
+    ''')
+
+    print("[HOOK] Optimizations applied")
+    return page
+
+
+async def cleanup_hook(page, context, **kwargs):
+    """Clean page before extraction"""
+    print("[HOOK] Cleaning page")
+
+    await page.evaluate('''() => {
+        const selectors = [
+            '.ad', '.ads', '.advertisement',
+            '.popup', '.modal', '.overlay',
+            '.cookie-banner', '.newsletter'
+        ];
+
+        selectors.forEach(sel => {
+            document.querySelectorAll(sel).forEach(el => el.remove());
+        });
+
+        document.querySelectorAll('script, style').forEach(el => el.remove());
+    }''')
+
+    print("[HOOK] Page cleaned")
+    return page
+
+
+# --- Content Extraction Hooks ---
+async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
+    """Wait for dynamic content to load"""
+    print(f"[HOOK] Waiting for dynamic content on {url}")
+
+    await page.wait_for_timeout(2000)
+
+    # Click "Load More" if exists
+    try:
+        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
+        if load_more:
+            await load_more.click()
+            await page.wait_for_timeout(1000)
+            print("[HOOK] Clicked 'Load More'")
+    except:
+        pass
+
+    return page
+
+
+async def extract_metadata_hook(page, context, **kwargs):
+    """Extract page metadata"""
+    print("[HOOK] Extracting metadata")
+
+    metadata = await page.evaluate('''() => {
+        const getMeta = (name) => {
+            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+            return el ? el.getAttribute('content') : null;
+        };
+
+        return {
+            title: document.title,
+            description: getMeta('description'),
+            author: getMeta('author'),
+            keywords: getMeta('keywords'),
+        };
+    }''')
+
+    print(f"[HOOK] Metadata: {metadata}")
+
+    # Infinite scroll
+    for i in range(3):
+        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+        await page.wait_for_timeout(1000)
+        print(f"[HOOK] Scroll {i+1}/3")
+
+    return page
+
+
+# --- Multi-URL Hooks ---
+async def url_specific_hook(page, context, url, **kwargs):
+    """Apply URL-specific logic"""
+    print(f"[HOOK] Processing URL: {url}")
+
+    # URL-specific headers
+    if 'html' in url:
+        await page.set_extra_http_headers({"X-Type": "HTML"})
+    elif 'json' in url:
+        await page.set_extra_http_headers({"X-Type": "JSON"})
+
+    return page
+
+
+async def track_progress_hook(page, context, url, response, **kwargs):
+    """Track crawl progress"""
+    status = response.status if response else 'unknown'
+    print(f"[HOOK] Loaded {url} - Status: {status}")
+    return page
+
+
+# ============================================================================
+# Test Functions
+# ============================================================================
+
+async def test_all_hooks_comprehensive():
+    """Test all 8 hook types"""
+    print("=" * 70)
+    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
+    print("=" * 70)
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nCrawling with all 8 hooks...")
+
+        # Define hooks with function objects
+        hooks = {
+            "on_browser_created": browser_created_hook,
+            "on_page_context_created": page_context_hook,
+            "on_user_agent_updated": user_agent_hook,
+            "before_goto": before_goto_hook,
+            "after_goto": after_goto_hook,
+            "on_execution_started": execution_started_hook,
+            "before_retrieve_html": before_retrieve_hook,
+            "before_return_html": before_return_hook
+        }
+
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks=hooks,
+            hooks_timeout=30
+        )
+
+        print("\n✅ Success!")
+        print(f"   URL: {result.url}")
+        print(f"   Success: {result.success}")
+        print(f"   HTML: {len(result.html)} chars")
+
+
+async def test_authentication_workflow():
+    """Test authentication with hooks"""
+    print("\n" + "=" * 70)
+    print("Test 2: Authentication Workflow (Docker Client)")
+    print("=" * 70)
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting authentication...")
+
+        hooks = {
+            "on_page_context_created": auth_context_hook,
+            "before_goto": auth_headers_hook
+        }
+
+        result = await client.crawl(
+            ["https://httpbin.org/basic-auth/user/passwd"],
+            hooks=hooks,
+            hooks_timeout=15
+        )
+
+        print("\n✅ Authentication completed")
+
+        if result.success:
+            if '"authenticated"' in result.html and 'true' in result.html:
+                print("   ✅ Basic auth successful!")
+            else:
+                print("   ⚠️ Auth status unclear")
+        else:
+            print(f"   ❌ Failed: {result.error_message}")
+
+
+async def test_performance_optimization():
+    """Test performance optimization"""
+    print("\n" + "=" * 70)
+    print("Test 3: Performance Optimization (Docker Client)")
+    print("=" * 70)
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting performance hooks...")
+
+        hooks = {
+            "on_page_context_created": performance_hook,
+            "before_retrieve_html": cleanup_hook
+        }
+
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks=hooks,
+            hooks_timeout=10
+        )
+
+        print("\n✅ Optimization completed")
+        print(f"   HTML size: {len(result.html):,} chars")
+        print("   Resources blocked, ads removed")
+
+
+async def test_content_extraction():
+    """Test content extraction"""
+    print("\n" + "=" * 70)
+    print("Test 4: Content Extraction (Docker Client)")
+    print("=" * 70)
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting extraction hooks...")
+
+        hooks = {
+            "after_goto": wait_dynamic_content_hook,
+            "before_retrieve_html": extract_metadata_hook
+        }
+
+        result = await client.crawl(
+            ["https://www.kidocode.com/"],
+            hooks=hooks,
+            hooks_timeout=20
+        )
+
+        print("\n✅ Extraction completed")
+        print(f"   URL: {result.url}")
+        print(f"   Success: {result.success}")
+        print(f"   Metadata: {result.metadata}")
+
+
+async def test_multi_url_crawl():
+    """Test hooks with multiple URLs"""
+    print("\n" + "=" * 70)
+    print("Test 5: Multi-URL Crawl (Docker Client)")
+    print("=" * 70)
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nCrawling multiple URLs...")
+
+        hooks = {
+            "before_goto": url_specific_hook,
+            "after_goto": track_progress_hook
+        }
+
+        results = await client.crawl(
+            [
+                "https://httpbin.org/html",
+                "https://httpbin.org/json",
+                "https://httpbin.org/xml"
+            ],
+            hooks=hooks,
+            hooks_timeout=15
+        )
+
+        print("\n✅ Multi-URL crawl completed")
+        print(f"\n   Crawled {len(results)} URLs:")
+        for i, result in enumerate(results, 1):
+            status = "✅" if result.success else "❌"
+            print(f"   {status} {i}. {result.url}")
+
+
+async def test_reusable_hook_library():
+    """Test using reusable hook library"""
+    print("\n" + "=" * 70)
+    print("Test 6: Reusable Hook Library (Docker Client)")
+    print("=" * 70)
+
+    # Create a library of reusable hooks
+    class HookLibrary:
+        @staticmethod
+        async def block_images(page, context, **kwargs):
+            """Block all images"""
+            await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
+            print("[LIBRARY] Images blocked")
+            return page
+
+        @staticmethod
+        async def block_analytics(page, context, **kwargs):
+            """Block analytics"""
+            await context.route("**/analytics/*", lambda r: r.abort())
+            await context.route("**/google-analytics.com/*", lambda r: r.abort())
+            print("[LIBRARY] Analytics blocked")
+            return page
+
+        @staticmethod
+        async def scroll_infinite(page, context, **kwargs):
+            """Handle infinite scroll"""
+            for i in range(5):
+                prev = await page.evaluate("document.body.scrollHeight")
+                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+                await page.wait_for_timeout(1000)
+                curr = await page.evaluate("document.body.scrollHeight")
+                if curr == prev:
+                    break
+            print("[LIBRARY] Infinite scroll complete")
+            return page
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nUsing hook library...")
+
+        hooks = {
+            "on_page_context_created": HookLibrary.block_images,
+            "before_retrieve_html": HookLibrary.scroll_infinite
+        }
+
+        result = await client.crawl(
+            ["https://www.kidocode.com/"],
+            hooks=hooks,
+            hooks_timeout=20
+        )
+
+        print("\n✅ Library hooks completed")
+        print(f"   Success: {result.success}")
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+async def main():
+    """Run all Docker client hook examples"""
+    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
+    print("Using Python function objects with automatic conversion")
+    print("=" * 70)
+
+    tests = [
+        ("All Hooks Demo", test_all_hooks_comprehensive),
+        ("Authentication", test_authentication_workflow),
+        ("Performance", test_performance_optimization),
+        ("Extraction", test_content_extraction),
+        ("Multi-URL", test_multi_url_crawl),
+        ("Hook Library", test_reusable_hook_library)
+    ]
+
+    for i, (name, test_func) in enumerate(tests, 1):
+        try:
+            await test_func()
+            print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
+        except Exception as e:
+            print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
+            import traceback
+            traceback.print_exc()
+
+    print("=" * 70)
+    print("🎉 All Docker client hook examples completed!")
+    print("\n💡 Key Benefits of Function-Based Hooks:")
+    print("   • Write as regular Python functions")
+    print("   • Full IDE support (autocomplete, types)")
+    print("   • Automatic conversion to API format")
+    print("   • Reusable across projects")
+    print("   • Clean, readable code")
+    print("   • Easy to test and debug")
+    print("=" * 70)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())