feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377

Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py)
2025-10-13 12:34:08 +08:00
parent 216019f29a
commit a3f057e19f
6 changed files with 1198 additions and 44 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -103,7 +103,8 @@ from .browser_adapter import (

 from .utils import (
    start_colab_display_server,
-    setup_colab_environment
+    setup_colab_environment,
+    hooks_to_string
 )

 __all__ = [
@@ -183,6 +184,7 @@ __all__ = [
    "ProxyConfig",
    "start_colab_display_server",
    "setup_colab_environment",
+    "hooks_to_string",
    # C4A Script additions
    "c4a_compile",
    "c4a_validate", 
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, AsyncGenerator, Dict, Any
+from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
 import httpx
 import json
 from urllib.parse import urljoin
@@ -7,6 +7,7 @@ import asyncio
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .models import CrawlResult
 from .async_logger import AsyncLogger, LogLevel
+from .utils import hooks_to_string


 class Crawl4aiClientError(Exception):
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
            self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
            raise ConnectionError(f"Cannot connect to server: {str(e)}")

-    def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, 
-                       crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
+    def _prepare_request(
+        self,
+        urls: List[str],
+        browser_config: Optional[BrowserConfig] = None,
+        crawler_config: Optional[CrawlerRunConfig] = None,
+        hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+        hooks_timeout: int = 30
+    ) -> Dict[str, Any]:
        """Prepare request data from configs."""
+        # Builds the JSON payload for a crawl request. When `hooks` is given,
+        # a "hooks" entry {"code": {name: source}, "timeout": hooks_timeout}
+        # is added. Callable values are converted with hooks_to_string();
+        # string values are sent unchanged, so one dict may mix both kinds.
        if self._token:
            self._http_client.headers["Authorization"] = f"Bearer {self._token}"
-        return {
+
+        request_data = {
            "urls": urls,
            "browser_config": browser_config.dump() if browser_config else {},
            "crawler_config": crawler_config.dump() if crawler_config else {}
        }

+        # Handle hooks if provided
+        if hooks:
+            # Convert only the callable entries. Handing the whole dict to
+            # hooks_to_string() would raise ValueError for any value that is
+            # already a source string.
+            callable_hooks = {
+                name: hook for name, hook in hooks.items() if callable(hook)
+            }
+            if callable_hooks:
+                converted = hooks_to_string(callable_hooks)
+                # Rebuild in the caller's order, swapping in the converted source.
+                hooks_code = {
+                    name: converted.get(name, hook) for name, hook in hooks.items()
+                }
+            else:
+                # Already in string format
+                hooks_code = hooks
+
+            request_data["hooks"] = {
+                "code": hooks_code,
+                "timeout": hooks_timeout
+            }
+
+        return request_data
+
    async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
        """Make an HTTP request with error handling."""
        url = urljoin(self.base_url, endpoint)
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
        self,
        urls: List[str],
        browser_config: Optional[BrowserConfig] = None,
-        crawler_config: Optional[CrawlerRunConfig] = None
+        crawler_config: Optional[CrawlerRunConfig] = None,
+        hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+        hooks_timeout: int = 30
    ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
-        """Execute a crawl operation."""
+        """
+        Execute a crawl operation.
+
+        Args:
+            urls: List of URLs to crawl
+            browser_config: Browser configuration
+            crawler_config: Crawler configuration
+            hooks: Optional hooks - can be either:
+                   - Dict[str, Callable]: Function objects that will be converted to strings
+                   - Dict[str, str]: Already stringified hook code
+            hooks_timeout: Timeout in seconds for each hook execution (1-120)
+
+        Returns:
+            Single CrawlResult, list of results, or async generator for streaming
+
+        Example with function hooks:
+            >>> async def my_hook(page, context, **kwargs):
+            ...     await page.set_viewport_size({"width": 1920, "height": 1080})
+            ...     return page
+            >>>
+            >>> result = await client.crawl(
+            ...     ["https://example.com"],
+            ...     hooks={"on_page_context_created": my_hook}
+            ... )
+        """
        await self._check_server()
-        
-        data = self._prepare_request(urls, browser_config, crawler_config)
+
+        data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
        is_streaming = crawler_config and crawler_config.stream
-        
+
        self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
-        
+
        if is_streaming:
            async def stream_results() -> AsyncGenerator[CrawlResult, None]:
                async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
                            else:
                                yield CrawlResult(**result)
            return stream_results()
-        
+
        response = await self._request("POST", "/crawl", json=data)
        result_data = response.json()
        if not result_data.get("success", False):
            raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
-        
+
        results = [CrawlResult(**r) for r in result_data.get("results", [])]
        self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
        return results[0] if len(results) == 1 else results
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -47,6 +47,7 @@ from urllib.parse import (
    urljoin, urlparse, urlunparse,
    parse_qsl, urlencode, quote, unquote
 )
+import inspect


 # Monkey patch to fix wildcard handling in urllib.robotparser
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
    available_gb = get_true_available_memory_gb()
    used_percent = get_true_memory_usage_percent()
    
-    return used_percent, available_gb, total_gb
+    return used_percent, available_gb, total_gb
+
+
+# Hook utilities for Docker API
+def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
+    """
+    Convert hook function objects to string representations for Docker API.
+
+    This utility simplifies the process of using hooks with the Docker API by converting
+    Python function objects into the string format required by the API.
+
+    The string for each hook is the text returned by ``inspect.getsource()``,
+    dedented so that functions defined inside a class or another function
+    start at column 0. Only the function's own source is captured; names it
+    uses from the surrounding module are not included.
+
+    Args:
+        hooks: Dictionary mapping hook point names to Python function objects.
+               Functions should be async and follow hook signature requirements.
+
+    Returns:
+        Dictionary mapping hook point names to string representations of the functions.
+
+    Example:
+        >>> async def my_hook(page, context, **kwargs):
+        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
+        ...     return page
+        >>>
+        >>> hooks_dict = {"on_page_context_created": my_hook}
+        >>> api_hooks = hooks_to_string(hooks_dict)
+        >>> # api_hooks is now ready to use with Docker API
+
+    Raises:
+        ValueError: If a hook is not callable or source cannot be extracted.
+            When extraction fails, the underlying OSError/TypeError is
+            attached as ``__cause__``.
+    """
+    result = {}
+
+    for hook_name, hook_func in hooks.items():
+        if not callable(hook_func):
+            raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
+
+        # Keep the try body to the single call that can fail.
+        try:
+            source = inspect.getsource(hook_func)
+        except (OSError, TypeError) as e:
+            # Chain the original error so the traceback shows why extraction failed.
+            raise ValueError(
+                f"Cannot extract source code for hook '{hook_name}'. "
+                f"Make sure the function is defined in a file (not interactively). Error: {e}"
+            ) from e
+
+        # Remove any leading indentation to get clean source
+        result[hook_name] = textwrap.dedent(source)
+
+    return result
--- a/docs/examples/docker_client_hooks_example.py
+++ b/docs/examples/docker_client_hooks_example.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python3
+"""
+Comprehensive hooks examples using Docker Client with function objects.
+
+This approach is recommended because:
+- Write hooks as regular Python functions
+- Full IDE support (autocomplete, type checking)
+- Automatic conversion to API format
+- Reusable and testable code
+- Clean, readable syntax
+"""
+
+import asyncio
+from crawl4ai import Crawl4aiDockerClient
+
+# Port 11235 is the one used in the Docker deployment guide examples; change
+# it if your container is published on a different host port.
+API_BASE_URL = "http://localhost:11235"
+
+
+# ============================================================================
+# Hook Function Definitions
+# ============================================================================
+
+# --- All Hooks Demo ---
+async def browser_created_hook(browser, **kwargs):
+    """Called after browser is created"""
+    # Logging only: the browser object is returned unchanged.
+    print("[HOOK] Browser created and ready")
+    return browser
+
+
+async def page_context_hook(page, context, **kwargs):
+    """Set the viewport, add a test cookie and block some resources; return the page."""
+    print("[HOOK] Setting up page environment")
+
+    # Fixed 1920x1080 viewport
+    viewport = {"width": 1920, "height": 1080}
+    await page.set_viewport_size(viewport)
+
+    # Test session cookie for the httpbin.org domain
+    session_cookie = {
+        "name": "test_session",
+        "value": "abc123xyz",
+        "domain": ".httpbin.org",
+        "path": "/"
+    }
+    await context.add_cookies([session_cookie])
+
+    # Abort requests matching common image extensions and analytics paths
+    for pattern in ("**/*.{png,jpg,jpeg,gif}", "**/analytics/*"):
+        await context.route(pattern, lambda route: route.abort())
+
+    print("[HOOK] Environment configured")
+    return page
+
+
+async def user_agent_hook(page, context, user_agent, **kwargs):
+    """Called when user agent is updated"""
+    # Only the first 50 characters are printed to keep the log line short.
+    print(f"[HOOK] User agent: {user_agent[:50]}...")
+    return page
+
+
+async def before_goto_hook(page, context, url, **kwargs):
+    """Log the target URL and attach two extra request headers; return the page."""
+    print(f"[HOOK] Navigating to: {url}")
+
+    extra_headers = {
+        "X-Custom-Header": "crawl4ai-test",
+        "Accept-Language": "en-US"
+    }
+    await page.set_extra_http_headers(extra_headers)
+
+    return page
+
+
+async def after_goto_hook(page, context, url, response, **kwargs):
+    """Called after page loads.
+
+    Pauses briefly, then waits for the ``body`` element. A failed wait is
+    logged and ignored so the crawl can continue; the page is always returned.
+    """
+    print(f"[HOOK] Page loaded: {url}")
+
+    await page.wait_for_timeout(1000)
+
+    try:
+        await page.wait_for_selector("body", timeout=2000)
+        print("[HOOK] Body element ready")
+    except Exception:
+        # Deliberately not a bare ``except:``: that would also swallow
+        # asyncio.CancelledError and KeyboardInterrupt, so the hook could not
+        # be cancelled while it is waiting.
+        print("[HOOK] Timeout, continuing")
+
+    return page
+
+
+async def execution_started_hook(page, context, **kwargs):
+    """Called when custom JS execution starts"""
+    print("[HOOK] JS execution started")
+    # The console.log below runs as JavaScript via page.evaluate, so it
+    # presumably lands in the page's console rather than this process's stdout.
+    await page.evaluate("console.log('[HOOK] Custom JS');")
+    return page
+
+
+async def before_retrieve_hook(page, context, **kwargs):
+    """Scroll to the bottom and back to the top before the HTML is read."""
+    print("[HOOK] Preparing HTML retrieval")
+
+    # Scrolling down gives lazily loaded content a chance to appear.
+    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
+    scroll_to_top = "window.scrollTo(0, 0);"
+
+    await page.evaluate(scroll_to_bottom)
+    await page.wait_for_timeout(500)
+    await page.evaluate(scroll_to_top)
+
+    print("[HOOK] Scrolling complete")
+    return page
+
+
+async def before_return_hook(page, context, html, **kwargs):
+    """Report the HTML length and a few element counts taken from the page."""
+    print(f"[HOOK] HTML ready: {len(html)} chars")
+
+    # Evaluated in the page; yields counts of images, links and scripts.
+    metrics_script = '''() => ({
+        images: document.images.length,
+        links: document.links.length,
+        scripts: document.scripts.length
+    })'''
+    metrics = await page.evaluate(metrics_script)
+
+    print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
+    return page
+
+
+# --- Authentication Hooks ---
+async def auth_context_hook(page, context, **kwargs):
+    """Add a placeholder auth cookie and two localStorage entries; return the page."""
+    print("[HOOK] Setting up authentication")
+
+    # Placeholder (fake) token, httpOnly, for the httpbin.org domain
+    auth_cookie = {
+        "name": "auth_token",
+        "value": "fake_jwt_token",
+        "domain": ".httpbin.org",
+        "path": "/",
+        "httpOnly": True
+    }
+    await context.add_cookies([auth_cookie])
+
+    # Stores a user id and the current timestamp in localStorage
+    storage_script = '''
+        localStorage.setItem('user_id', '12345');
+        localStorage.setItem('auth_time', new Date().toISOString());
+    '''
+    await page.evaluate(storage_script)
+
+    print("[HOOK] Auth context ready")
+    return page
+
+
+async def auth_headers_hook(page, context, url, **kwargs):
+    """Attach HTTP Basic credentials and an API key header before navigation."""
+    print(f"[HOOK] Adding auth headers for {url}")
+
+    # Imported inside the function so the hook's source stays self-contained
+    # when it is converted to a string.
+    import base64
+    encoded = base64.b64encode(b"user:passwd")
+    credentials = encoded.decode('ascii')
+
+    auth_headers = {
+        'Authorization': f'Basic {credentials}',
+        'X-API-Key': 'test-key-123'
+    }
+    await page.set_extra_http_headers(auth_headers)
+
+    return page
+
+
+# --- Performance Optimization Hooks ---
+async def performance_hook(page, context, **kwargs):
+    """Abort requests for heavy or tracking resources and disable CSS animations."""
+    print("[HOOK] Optimizing for performance")
+
+    # Images, fonts, media and a few tracker hosts, registered in this order.
+    blocked_patterns = (
+        "**/*.{png,jpg,jpeg,gif,webp,svg}",
+        "**/*.{woff,woff2,ttf}",
+        "**/*.{mp4,webm,ogg}",
+        "**/googletagmanager.com/*",
+        "**/google-analytics.com/*",
+        "**/facebook.com/*",
+    )
+    for pattern in blocked_patterns:
+        await context.route(pattern, lambda r: r.abort())
+
+    # Zero-duration animations and transitions for every element
+    no_animation_css = '''
+        *, *::before, *::after {
+            animation-duration: 0s !important;
+            transition-duration: 0s !important;
+        }
+    '''
+    await page.add_style_tag(content=no_animation_css)
+
+    print("[HOOK] Optimizations applied")
+    return page
+
+
+async def cleanup_hook(page, context, **kwargs):
+    """Remove ad/popup-style elements and all script and style tags from the DOM."""
+    print("[HOOK] Cleaning page")
+
+    # Runs in the page: deletes elements matching the listed selectors, then
+    # every <script> and <style> element.
+    cleanup_script = '''() => {
+        const selectors = [
+            '.ad', '.ads', '.advertisement',
+            '.popup', '.modal', '.overlay',
+            '.cookie-banner', '.newsletter'
+        ];
+
+        selectors.forEach(sel => {
+            document.querySelectorAll(sel).forEach(el => el.remove());
+        });
+
+        document.querySelectorAll('script, style').forEach(el => el.remove());
+    }'''
+    await page.evaluate(cleanup_script)
+
+    print("[HOOK] Page cleaned")
+    return page
+
+
+# --- Content Extraction Hooks ---
+async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
+    """Wait for dynamic content to load.
+
+    Pauses, then clicks a "Load More" control if one is found. The click is
+    best effort: any ordinary error is logged and the page is still returned.
+    """
+    print(f"[HOOK] Waiting for dynamic content on {url}")
+
+    await page.wait_for_timeout(2000)
+
+    # Click "Load More" if exists
+    try:
+        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
+        if load_more:
+            await load_more.click()
+            await page.wait_for_timeout(1000)
+            print("[HOOK] Clicked 'Load More'")
+    except Exception as exc:
+        # Still best effort, but no longer a bare ``except: pass``: that form
+        # also swallowed asyncio.CancelledError/KeyboardInterrupt and hid the
+        # reason the click failed.
+        print(f"[HOOK] 'Load More' step skipped: {exc}")
+
+    return page
+
+
+async def extract_metadata_hook(page, context, **kwargs):
+    """Print title/description/author/keywords metadata, then scroll down three times."""
+    print("[HOOK] Extracting metadata")
+
+    # Runs in the page: reads document.title and three <meta> values, looking
+    # each one up by either its name or its property attribute.
+    metadata_script = '''() => {
+        const getMeta = (name) => {
+            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+            return el ? el.getAttribute('content') : null;
+        };
+
+        return {
+            title: document.title,
+            description: getMeta('description'),
+            author: getMeta('author'),
+            keywords: getMeta('keywords'),
+        };
+    }'''
+    metadata = await page.evaluate(metadata_script)
+
+    print(f"[HOOK] Metadata: {metadata}")
+
+    # Infinite scroll: three scroll-to-bottom passes, one second apart
+    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
+    for i in range(3):
+        await page.evaluate(scroll_to_bottom)
+        await page.wait_for_timeout(1000)
+        print(f"[HOOK] Scroll {i+1}/3")
+
+    return page
+
+
+# --- Multi-URL Hooks ---
+async def url_specific_hook(page, context, url, **kwargs):
+    """Set an X-Type header chosen from substrings of the URL; return the page."""
+    print(f"[HOOK] Processing URL: {url}")
+
+    # 'html' is checked first, so a URL containing both substrings gets "HTML".
+    if 'html' in url:
+        content_kind = "HTML"
+    elif 'json' in url:
+        content_kind = "JSON"
+    else:
+        content_kind = None
+
+    # URLs matching neither substring get no extra header.
+    if content_kind is not None:
+        await page.set_extra_http_headers({"X-Type": content_kind})
+
+    return page
+
+
+async def track_progress_hook(page, context, url, response, **kwargs):
+    """Print the URL with its response status, or 'unknown' when no response is given."""
+    if response:
+        status = response.status
+    else:
+        status = 'unknown'
+    print(f"[HOOK] Loaded {url} - Status: {status}")
+    return page
+
+
+# ============================================================================
+# Test Functions
+# ============================================================================
+
+async def test_all_hooks_comprehensive():
+    """Run one crawl with a hook function registered at each of the 8 hook points."""
+    banner = "=" * 70
+    print(banner)
+    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
+    print(banner)
+
+    # Hook point name -> function object, passed to the client as-is.
+    all_hooks = {
+        "on_browser_created": browser_created_hook,
+        "on_page_context_created": page_context_hook,
+        "on_user_agent_updated": user_agent_hook,
+        "before_goto": before_goto_hook,
+        "after_goto": after_goto_hook,
+        "on_execution_started": execution_started_hook,
+        "before_retrieve_html": before_retrieve_hook,
+        "before_return_html": before_return_hook
+    }
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nCrawling with all 8 hooks...")
+
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks=all_hooks,
+            hooks_timeout=30
+        )
+
+        print("\n✅ Success!")
+        print(f"   URL: {result.url}")
+        print(f"   Success: {result.success}")
+        print(f"   HTML: {len(result.html)} chars")
+
+
+async def test_authentication_workflow():
+    """Crawl httpbin's basic-auth endpoint with the two authentication hooks."""
+    banner = "=" * 70
+    print("\n" + banner)
+    print("Test 2: Authentication Workflow (Docker Client)")
+    print(banner)
+
+    auth_hooks = {
+        "on_page_context_created": auth_context_hook,
+        "before_goto": auth_headers_hook
+    }
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting authentication...")
+
+        result = await client.crawl(
+            ["https://httpbin.org/basic-auth/user/passwd"],
+            hooks=auth_hooks,
+            hooks_timeout=15
+        )
+
+        print("\n✅ Authentication completed")
+
+        # Success is inferred from the response body containing both markers.
+        if not result.success:
+            print(f"   ❌ Failed: {result.error_message}")
+        elif '"authenticated"' in result.html and 'true' in result.html:
+            print("   ✅ Basic auth successful!")
+        else:
+            print("   ⚠️ Auth status unclear")
+
+
+async def test_performance_optimization():
+    """Crawl one page with the resource-blocking and page-cleanup hooks."""
+    banner = "=" * 70
+    print("\n" + banner)
+    print("Test 3: Performance Optimization (Docker Client)")
+    print(banner)
+
+    perf_hooks = {
+        "on_page_context_created": performance_hook,
+        "before_retrieve_html": cleanup_hook
+    }
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting performance hooks...")
+
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks=perf_hooks,
+            hooks_timeout=10
+        )
+
+        print("\n✅ Optimization completed")
+        print(f"   HTML size: {len(result.html):,} chars")
+        print("   Resources blocked, ads removed")
+
+
+async def test_content_extraction():
+    """Crawl one site with the dynamic-content wait and metadata extraction hooks."""
+    banner = "=" * 70
+    print("\n" + banner)
+    print("Test 4: Content Extraction (Docker Client)")
+    print(banner)
+
+    extraction_hooks = {
+        "after_goto": wait_dynamic_content_hook,
+        "before_retrieve_html": extract_metadata_hook
+    }
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nTesting extraction hooks...")
+
+        result = await client.crawl(
+            ["https://www.kidocode.com/"],
+            hooks=extraction_hooks,
+            hooks_timeout=20
+        )
+
+        print("\n✅ Extraction completed")
+        print(f"   URL: {result.url}")
+        print(f"   Success: {result.success}")
+        print(f"   Metadata: {result.metadata}")
+
+
+async def test_multi_url_crawl():
+    """Crawl three httpbin URLs in one request with per-URL hooks and list the outcomes."""
+    banner = "=" * 70
+    print("\n" + banner)
+    print("Test 5: Multi-URL Crawl (Docker Client)")
+    print(banner)
+
+    target_urls = [
+        "https://httpbin.org/html",
+        "https://httpbin.org/json",
+        "https://httpbin.org/xml"
+    ]
+    multi_url_hooks = {
+        "before_goto": url_specific_hook,
+        "after_goto": track_progress_hook
+    }
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nCrawling multiple URLs...")
+
+        results = await client.crawl(
+            target_urls,
+            hooks=multi_url_hooks,
+            hooks_timeout=15
+        )
+
+        print("\n✅ Multi-URL crawl completed")
+        print(f"\n   Crawled {len(results)} URLs:")
+        for i, result in enumerate(results, 1):
+            if result.success:
+                status = "✅"
+            else:
+                status = "❌"
+            print(f"   {status} {i}. {result.url}")
+
+
+async def test_reusable_hook_library():
+    """Test using reusable hook library"""
+    print("\n" + "=" * 70)
+    print("Test 6: Reusable Hook Library (Docker Client)")
+    print("=" * 70)
+
+    # Create a library of reusable hooks
+    # NOTE(review): these are nested @staticmethod definitions, so the source
+    # text extracted for them is indented and begins with the decorator line;
+    # confirm the server accepts hook code in that form.
+    class HookLibrary:
+        @staticmethod
+        async def block_images(page, context, **kwargs):
+            """Block all images"""
+            await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
+            print("[LIBRARY] Images blocked")
+            return page
+
+        # Defined for completeness; not registered in the hooks dict below.
+        @staticmethod
+        async def block_analytics(page, context, **kwargs):
+            """Block analytics"""
+            await context.route("**/analytics/*", lambda r: r.abort())
+            await context.route("**/google-analytics.com/*", lambda r: r.abort())
+            print("[LIBRARY] Analytics blocked")
+            return page
+
+        @staticmethod
+        async def scroll_infinite(page, context, **kwargs):
+            """Handle infinite scroll"""
+            # At most 5 passes; stops early once scrolling no longer
+            # increases the document height.
+            for i in range(5):
+                prev = await page.evaluate("document.body.scrollHeight")
+                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+                await page.wait_for_timeout(1000)
+                curr = await page.evaluate("document.body.scrollHeight")
+                if curr == prev:
+                    break
+            print("[LIBRARY] Infinite scroll complete")
+            return page
+
+    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+        print("\nUsing hook library...")
+
+        # Only two of the three library hooks are used in this example.
+        hooks = {
+            "on_page_context_created": HookLibrary.block_images,
+            "before_retrieve_html": HookLibrary.scroll_infinite
+        }
+
+        result = await client.crawl(
+            ["https://www.kidocode.com/"],
+            hooks=hooks,
+            hooks_timeout=20
+        )
+
+        print("\n✅ Library hooks completed")
+        print(f"   Success: {result.success}")
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+async def main():
+    """Run every example in order, printing a per-example result and a closing summary."""
+    import traceback
+
+    banner = "=" * 70
+    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
+    print("Using Python function objects with automatic conversion")
+    print(banner)
+
+    tests = [
+        ("All Hooks Demo", test_all_hooks_comprehensive),
+        ("Authentication", test_authentication_workflow),
+        ("Performance", test_performance_optimization),
+        ("Extraction", test_content_extraction),
+        ("Multi-URL", test_multi_url_crawl),
+        ("Hook Library", test_reusable_hook_library)
+    ]
+
+    # A failing example is reported with its traceback and does not stop the
+    # remaining examples from running.
+    for i, (name, test_func) in enumerate(tests, 1):
+        try:
+            await test_func()
+        except Exception as e:
+            print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
+            traceback.print_exc()
+        else:
+            print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
+
+    print(banner)
+    closing_lines = (
+        "🎉 All Docker client hook examples completed!",
+        "\n💡 Key Benefits of Function-Based Hooks:",
+        "   • Write as regular Python functions",
+        "   • Full IDE support (autocomplete, types)",
+        "   • Automatic conversion to API format",
+        "   • Reusable across projects",
+        "   • Clean, readable code",
+        "   • Easy to test and debug",
+    )
+    for line in closing_lines:
+        print(line)
+    print(banner)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -6,18 +6,6 @@
  - [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
  - [Option 2: Using Docker Compose](#option-2-using-docker-compose)
  - [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
-  - [Playground Interface](#playground-interface)
-  - [Python SDK](#python-sdk)
-  - [Understanding Request Schema](#understanding-request-schema)
-  - [REST API Examples](#rest-api-examples)
- [Additional API Endpoints](#additional-api-endpoints)
-  - [HTML Extraction Endpoint](#html-extraction-endpoint)
-  - [Screenshot Endpoint](#screenshot-endpoint)
-  - [PDF Export Endpoint](#pdf-export-endpoint)
-  - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
-  - [Library Context Endpoint](#library-context-endpoint)
 - [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
  - [What is MCP?](#what-is-mcp)
  - [Connecting via MCP](#connecting-via-mcp)
@@ -25,9 +13,28 @@
  - [Available MCP Tools](#available-mcp-tools)
  - [Testing MCP Connections](#testing-mcp-connections)
  - [MCP Schemas](#mcp-schemas)
+- [Additional API Endpoints](#additional-api-endpoints)
+  - [HTML Extraction Endpoint](#html-extraction-endpoint)
+  - [Screenshot Endpoint](#screenshot-endpoint)
+  - [PDF Export Endpoint](#pdf-export-endpoint)
+  - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
+- [User-Provided Hooks API](#user-provided-hooks-api)
+  - [Hook Information Endpoint](#hook-information-endpoint)
+  - [Available Hook Points](#available-hook-points)
+  - [Using Hooks in Requests](#using-hooks-in-requests)
+  - [Hook Examples with Real URLs](#hook-examples-with-real-urls)
+  - [Security Best Practices](#security-best-practices)
+  - [Hook Response Information](#hook-response-information)
+  - [Error Handling](#error-handling)
+  - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+  - [Playground Interface](#playground-interface)
+  - [Python SDK](#python-sdk)
+  - [Understanding Request Schema](#understanding-request-schema)
+  - [REST API Examples](#rest-api-examples)
+  - [LLM Configuration Examples](#llm-configuration-examples)
 - [Metrics & Monitoring](#metrics--monitoring)
- [Deployment Scenarios](#deployment-scenarios)
- [Complete Examples](#complete-examples)
 - [Server Configuration](#server-configuration)
  - [Understanding config.yml](#understanding-configyml)
  - [JWT Authentication](#jwt-authentication)
@@ -832,6 +839,275 @@ else:

 > 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.

+### Hooks Utility: Function-Based Approach (Python)
+
+For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
+
+#### Why Use Function-Based Hooks?
+
+**String-Based Approach (shown above)**:
+```python
+hooks_code = {
+    "on_page_context_created": """
+async def hook(page, context, **kwargs):
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    return page
+"""
+}
+```
+
+**Function-Based Approach (recommended for Python)**:
+```python
+from crawl4ai import Crawl4aiDockerClient
+
+async def my_hook(page, context, **kwargs):
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    return page
+
+async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
+    result = await client.crawl(
+        ["https://example.com"],
+        hooks={"on_page_context_created": my_hook}
+    )
+```
+
+**Benefits**:
+- ✅ Write hooks as regular Python functions
+- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
+- ✅ Easy to test and debug
+- ✅ Reusable hook libraries
+- ✅ Automatic conversion to API format
+
+#### Using the Hooks Utility
+
+The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
+
+```python
+from crawl4ai import hooks_to_string
+
+# Define your hooks as functions
+async def setup_hook(page, context, **kwargs):
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    await context.add_cookies([{
+        "name": "session",
+        "value": "token",
+        "domain": ".example.com"
+    }])
+    return page
+
+async def scroll_hook(page, context, **kwargs):
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+    return page
+
+# Convert to string format
+hooks_dict = {
+    "on_page_context_created": setup_hook,
+    "before_retrieve_html": scroll_hook
+}
+hooks_string = hooks_to_string(hooks_dict)
+
+# Now use with REST API or Docker client
+# hooks_string contains the string representations
+```
+
+#### Docker Client with Automatic Conversion
+
+The Docker client automatically detects and converts function objects:
+
+```python
+from crawl4ai import Crawl4aiDockerClient
+
+async def auth_hook(page, context, **kwargs):
+    """Add authentication cookies"""
+    await context.add_cookies([{
+        "name": "auth_token",
+        "value": "your_token",
+        "domain": ".example.com"
+    }])
+    return page
+
+async def performance_hook(page, context, **kwargs):
+    """Block unnecessary resources"""
+    await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
+    await context.route("**/analytics/*", lambda r: r.abort())
+    return page
+
+async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
+    # Pass functions directly - automatic conversion!
+    result = await client.crawl(
+        ["https://example.com"],
+        hooks={
+            "on_page_context_created": performance_hook,
+            "before_goto": auth_hook
+        },
+        hooks_timeout=30  # Optional timeout in seconds (1-120)
+    )
+
+    print(f"Success: {result.success}")
+    print(f"HTML: {len(result.html)} chars")
+```
+
+#### Creating Reusable Hook Libraries
+
+Build collections of reusable hooks:
+
+```python
+# hooks_library.py
+class CrawlHooks:
+    """Reusable hook collection for common crawling tasks.
+
+    Every hook is a static coroutine that receives the ``page`` and
+    ``context`` objects (plus any hook-specific arguments) and returns
+    ``page``.
+    """
+
+    @staticmethod
+    async def block_images(page, context, **kwargs):
+        """Block all images to speed up crawling"""
+        await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
+        return page
+
+    @staticmethod
+    async def block_analytics(page, context, **kwargs):
+        """Block analytics and tracking scripts"""
+        tracking_domains = [
+            "**/google-analytics.com/*",
+            "**/googletagmanager.com/*",
+            "**/facebook.com/tr/*",
+            "**/doubleclick.net/*"
+        ]
+        for domain in tracking_domains:
+            await context.route(domain, lambda r: r.abort())
+        return page
+
+    @staticmethod
+    async def scroll_infinite(page, context, **kwargs):
+        """Handle infinite scroll to load more content"""
+        previous_height = 0
+        for _ in range(5):  # Max 5 scrolls
+            current_height = await page.evaluate("document.body.scrollHeight")
+            # Stop as soon as a scroll no longer grows the page
+            if current_height == previous_height:
+                break
+            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            await page.wait_for_timeout(1000)
+            previous_height = current_height
+        return page
+
+    @staticmethod
+    async def wait_for_dynamic_content(page, context, url, response, **kwargs):
+        """Wait for dynamic content to load"""
+        await page.wait_for_timeout(2000)
+        try:
+            # Click "Load More" if present
+            load_more = await page.query_selector('[class*="load-more"]')
+            if load_more:
+                await load_more.click()
+                await page.wait_for_timeout(1000)
+        except Exception:
+            # Best-effort: a missing or unclickable button must not fail the
+            # crawl. Catching Exception rather than using a bare except lets
+            # asyncio.CancelledError and KeyboardInterrupt still propagate.
+            pass
+        return page
+
+# Use in your application
+from hooks_library import CrawlHooks
+from crawl4ai import Crawl4aiDockerClient
+
+async def crawl_with_optimizations(url):
+    """Crawl ``url`` with image blocking and infinite-scroll hooks enabled.
+
+    Returns whatever ``client.crawl`` returns for the single-URL list.
+    """
+    # Target the Docker server port used throughout this guide; without an
+    # explicit base_url the client falls back to http://localhost:8000.
+    async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
+        result = await client.crawl(
+            [url],
+            hooks={
+                "on_page_context_created": CrawlHooks.block_images,
+                "before_retrieve_html": CrawlHooks.scroll_infinite
+            }
+        )
+        return result
+```
+
+#### Choosing the Right Approach
+
+| Approach | Best For | IDE Support | Language |
+|----------|----------|-------------|----------|
+| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
+| **Function-based** | Python applications, local development | ✅ Full | Python only |
+| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
+
+**Recommendation**:
+- **Python applications**: Use Docker client with function objects (easiest)
+- **Non-Python or REST API**: Use string-based hooks (most flexible)
+- **Manual control**: Use `hooks_to_string()` utility (middle ground)
+
+#### Complete Example with Function Hooks
+
+```python
+from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
+
+# Define hooks as regular Python functions
+async def setup_environment(page, context, **kwargs):
+    """Setup crawling environment"""
+    # Set viewport
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+
+    # Block resources for speed
+    await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
+
+    # Add custom headers
+    await page.set_extra_http_headers({
+        "Accept-Language": "en-US",
+        "X-Custom-Header": "Crawl4AI"
+    })
+
+    print("[HOOK] Environment configured")
+    return page
+
+async def extract_content(page, context, **kwargs):
+    """Extract and prepare content"""
+    # Scroll to load lazy content
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+    await page.wait_for_timeout(1000)
+
+    # Extract metadata
+    metadata = await page.evaluate('''() => ({
+        title: document.title,
+        links: document.links.length,
+        images: document.images.length
+    })''')
+
+    print(f"[HOOK] Page metadata: {metadata}")
+    return page
+
+async def main():
+    async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
+        # Configure crawl
+        browser_config = BrowserConfig(headless=True)
+        crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+        # Crawl with hooks
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            browser_config=browser_config,
+            crawler_config=crawler_config,
+            hooks={
+                "on_page_context_created": setup_environment,
+                "before_retrieve_html": extract_content
+            },
+            hooks_timeout=30
+        )
+
+        if result.success:
+            print(f"✅ Crawl successful!")
+            print(f"   URL: {result.url}")
+            print(f"   HTML: {len(result.html)} chars")
+            print(f"   Markdown: {len(result.markdown)} chars")
+        else:
+            print(f"❌ Crawl failed: {result.error_message}")
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+
+#### Additional Resources
+
+- **Comprehensive Examples**: See `/docs/examples/docker_client_hooks_example.py` for Python function-based examples
+- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
+- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
+- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
+
 ---

 ## Dockerfile Parameters
@@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when

 Install the SDK: `pip install crawl4ai`

+The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
+
 ```python
 import asyncio
 from crawl4ai.docker_client import Crawl4aiDockerClient
-from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
+from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode

 async def main():
    # Point to the correct server port
@@ -907,23 +1185,22 @@ async def main():
        print("--- Running Non-Streaming Crawl ---")
        results = await client.crawl(
            ["https://httpbin.org/html"],
-            browser_config=BrowserConfig(headless=True), # Use library classes for config aid
+            browser_config=BrowserConfig(headless=True),
            crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
-        if results: # client.crawl returns None on failure
-          print(f"Non-streaming results success: {results.success}")
-          if results.success:
-              for result in results: # Iterate through the CrawlResultContainer
-                  print(f"URL: {result.url}, Success: {result.success}")
+        if results:
+            print(f"Non-streaming results success: {results.success}")
+            if results.success:
+                for result in results:
+                    print(f"URL: {result.url}, Success: {result.success}")
        else:
            print("Non-streaming crawl failed.")

-
        # Example Streaming crawl
        print("\n--- Running Streaming Crawl ---")
        stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
        try:
-            async for result in await client.crawl( # client.crawl returns an async generator for streaming
+            async for result in await client.crawl(
                ["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
                browser_config=BrowserConfig(headless=True),
                crawler_config=stream_config
@@ -932,17 +1209,56 @@ async def main():
        except Exception as e:
            print(f"Streaming crawl failed: {e}")

+        # Example with hooks (Python function objects)
+        print("\n--- Crawl with Hooks ---")
+
+        async def my_hook(page, context, **kwargs):
+            """Custom hook to optimize performance"""
+            await page.set_viewport_size({"width": 1920, "height": 1080})
+            await context.route("**/*.{png,jpg}", lambda r: r.abort())
+            print("[HOOK] Page optimized")
+            return page
+
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            browser_config=BrowserConfig(headless=True),
+            crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+            hooks={"on_page_context_created": my_hook},  # Pass function directly!
+            hooks_timeout=30
+        )
+        print(f"Crawl with hooks success: {result.success}")

        # Example Get schema
        print("\n--- Getting Schema ---")
        schema = await client.get_schema()
-        print(f"Schema received: {bool(schema)}") # Print whether schema was received
+        print(f"Schema received: {bool(schema)}")

 if __name__ == "__main__":
    asyncio.run(main())
 ```

-*(SDK parameters like timeout, verify_ssl etc. remain the same)*
+#### SDK Parameters
+
+The Docker client supports the following parameters:
+
+**Client Initialization**:
+- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`; pass `http://localhost:11235` to reach the container port used in this guide's examples)
+- `timeout` (float): Request timeout in seconds (default: 30.0)
+- `verify_ssl` (bool): Verify SSL certificates (default: True)
+- `verbose` (bool): Enable verbose logging (default: True)
+- `log_file` (Optional[str]): Path to log file (default: None)
+
+**crawl() Method**:
+- `urls` (List[str]): List of URLs to crawl
+- `browser_config` (Optional[BrowserConfig]): Browser configuration
+- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
+- `hooks` (Optional[Dict]): Hook functions or strings - **automatically converts function objects!**
+- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30, allowed range: 1-120)
+
+**Returns**:
+- Single URL: `CrawlResult` object
+- Multiple URLs: `List[CrawlResult]`
+- Streaming: `AsyncGenerator[CrawlResult]`

 ### Second Approach: Direct API Calls

@@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:

 In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
 - Building and running the Docker container
- Configuring the environment  
+- Configuring the environment
 - Using the interactive playground for testing
 - Making API requests with proper typing
- Using the Python SDK
+- Using the Python SDK with **automatic hook conversion**
+- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
 - Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
 - Connecting via the Model Context Protocol (MCP)
 - Monitoring your deployment

-The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
+### Key Features

-For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
+- **String-based** (REST API): Works with any language, requires manual string formatting
+- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion

-Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests.
+
+**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+
+### Next Steps
+
+1. **Explore Examples**: Check out the comprehensive examples in:
+   - `/docs/examples/docker_client_hooks_example.py` - Python function-based hooks
+   - `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
+   - `/docs/examples/README_HOOKS.md` - Comparison and guide
+
+2. **Read Documentation**:
+   - `/docs/hooks-utility-guide.md` - Complete hooks utility guide
+   - API documentation for detailed configuration options
+
+3. **Join the Community**:
+   - GitHub: Report issues and contribute
+   - Discord: Get help and share your experiences
+   - Documentation: Comprehensive guides and tutorials

 Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀

--- a/tests/docker/test_hooks_utility.py
+++ b/tests/docker/test_hooks_utility.py
@@ -0,0 +1,193 @@
+"""
+Test script demonstrating the hooks_to_string utility and Docker client integration.
+"""
+import asyncio
+from crawl4ai import Crawl4aiDockerClient, hooks_to_string
+
+
+# Define hook functions as regular Python functions
+async def auth_hook(page, context, **kwargs):
+    """Add a test cookie for ``.httpbin.org`` to the browser context.
+
+    Args:
+        page: Page object handed to the hook; returned unchanged.
+        context: Browser context whose ``add_cookies`` coroutine is awaited.
+        **kwargs: Extra hook arguments; unused here.
+
+    Returns:
+        The ``page`` object that was passed in.
+    """
+    await context.add_cookies([{
+        'name': 'test_cookie',
+        'value': 'test_value',
+        'domain': '.httpbin.org',
+        'path': '/'
+    }])
+    return page
+
+
+async def scroll_hook(page, context, **kwargs):
+    """Scroll to the bottom of the page, then pause so lazy content can load.
+
+    Args:
+        page: Page object used for ``evaluate`` and ``wait_for_timeout``.
+        context: Browser context; unused here.
+        **kwargs: Extra hook arguments; unused here.
+
+    Returns:
+        The ``page`` object that was passed in.
+    """
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+    # Wait 1000 ms after the scroll before handing the page back
+    await page.wait_for_timeout(1000)
+    return page
+
+
+async def viewport_hook(page, context, **kwargs):
+    """Set the page viewport to 1920x1080.
+
+    Args:
+        page: Page object whose ``set_viewport_size`` coroutine is awaited.
+        context: Browser context; unused here.
+        **kwargs: Extra hook arguments; unused here.
+
+    Returns:
+        The ``page`` object that was passed in.
+    """
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    return page
+
+
+async def test_hooks_utility():
+    """Convert function hooks with ``hooks_to_string`` and verify the result.
+
+    Returns:
+        The converted mapping of hook point name to hook source string.
+
+    Raises:
+        AssertionError: If the converted mapping has different hook points
+            than the input, or any converted value is not a string.
+    """
+    print("=" * 60)
+    print("Testing hooks_to_string utility")
+    print("=" * 60)
+
+    # Create hooks dictionary with function objects
+    hooks_dict = {
+        "on_page_context_created": auth_hook,
+        "before_retrieve_html": scroll_hook
+    }
+
+    # Convert to string format
+    hooks_string = hooks_to_string(hooks_dict)
+
+    # Check the conversion before announcing success, so the banner below
+    # cannot be printed for a broken result.
+    assert set(hooks_string) == set(hooks_dict), "hook points changed during conversion"
+    assert all(isinstance(source, str) for source in hooks_string.values()), (
+        "every converted hook must be a string"
+    )
+
+    print("\n✓ Successfully converted function objects to strings")
+    print(f"\n✓ Converted {len(hooks_string)} hooks:")
+    for hook_name in hooks_string:
+        print(f"  - {hook_name}")
+
+    print("\n✓ Preview of converted hook:")
+    print("-" * 60)
+    preview = hooks_string["on_page_context_created"]
+    # Only mark the preview as truncated when text was actually cut off
+    print(preview[:200] + ("..." if len(preview) > 200 else ""))
+    print("-" * 60)
+
+    return hooks_string
+
+
+async def test_docker_client_with_functions():
+    """Crawl through the Docker client, passing hooks as function objects.
+
+    Requires a running Crawl4AI Docker server at ``http://localhost:11234``.
+
+    Raises:
+        AssertionError: If the crawl result reports failure.
+    """
+    print("\n" + "=" * 60)
+    print("Testing Docker Client with Function Objects")
+    print("=" * 60)
+
+    # Note: This talks to a live Crawl4AI Docker server; it is not an
+    # offline test.
+
+    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
+        # Pass function objects directly - they'll be converted automatically
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks={
+                "on_page_context_created": auth_hook,
+                "before_retrieve_html": scroll_hook
+            },
+            hooks_timeout=30
+        )
+        # Fail loudly rather than printing success banners for a failed crawl
+        assert result.success, f"Crawl failed: {result.error_message}"
+        print(f"\n✓ Crawl successful: {result.success}")
+        print(f"✓ URL: {result.url}")
+
+    print("\n✓ Docker client accepts function objects directly")
+    print("✓ Automatic conversion happens internally")
+    print("✓ No manual string formatting needed!")
+
+
+async def test_docker_client_with_strings():
+    """Crawl through the Docker client, passing hooks as pre-converted strings.
+
+    Requires a running Crawl4AI Docker server at ``http://localhost:11234``.
+
+    Raises:
+        AssertionError: If the crawl result reports failure.
+    """
+    print("\n" + "=" * 60)
+    print("Testing Docker Client with String Hooks")
+    print("=" * 60)
+
+    # Convert hooks to strings first
+    hooks_dict = {
+        "on_page_context_created": viewport_hook,
+        "before_retrieve_html": scroll_hook
+    }
+    hooks_string = hooks_to_string(hooks_dict)
+
+    # Note: This talks to a live Crawl4AI Docker server; it is not an
+    # offline test.
+
+    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
+        # Pass string hooks - they'll be used as-is
+        result = await client.crawl(
+            ["https://httpbin.org/html"],
+            hooks=hooks_string,
+            hooks_timeout=30
+        )
+        # Fail loudly rather than printing success banners for a failed crawl
+        assert result.success, f"Crawl failed: {result.error_message}"
+        print(f"\n✓ Crawl successful: {result.success}")
+
+    print("\n✓ Docker client also accepts pre-converted strings")
+    print("✓ Backward compatible with existing code")
+
+
+async def show_usage_patterns():
+    """Print three equivalent ways of supplying hooks to the Docker client."""
+    banner = "=" * 60
+    divider = "-" * 60
+
+    def emit(heading, snippet):
+        # One numbered section: heading, divider line, then the literal snippet
+        print(heading)
+        print(divider)
+        print(snippet)
+
+    print("\n" + banner)
+    print("Usage Patterns")
+    print(banner)
+
+    emit("\n1. Direct function usage (simplest):", """
+    async def my_hook(page, context, **kwargs):
+        await page.set_viewport_size({"width": 1920, "height": 1080})
+        return page
+
+    result = await client.crawl(
+        ["https://example.com"],
+        hooks={"on_page_context_created": my_hook}
+    )
+    """)
+
+    emit("\n2. Convert then use:", """
+    hooks_dict = {"on_page_context_created": my_hook}
+    hooks_string = hooks_to_string(hooks_dict)
+
+    result = await client.crawl(
+        ["https://example.com"],
+        hooks=hooks_string
+    )
+    """)
+
+    emit("\n3. Manual string (backward compatible):", """
+    hooks_string = {
+        "on_page_context_created": '''
+async def hook(page, context, **kwargs):
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    return page
+'''
+    }
+
+    result = await client.crawl(
+        ["https://example.com"],
+        hooks=hooks_string
+    )
+    """)
+
+
+async def main():
+    """Run every check in this script and print a summary.
+
+    The conversion check runs first because it needs no server. The two
+    Docker client checks that follow require a running Crawl4AI Docker
+    server. An exception raised by any step propagates, so the summary is
+    printed only when all steps finish.
+    """
+    print("\n🚀 Crawl4AI Hooks Utility Test Suite\n")
+
+    # Test the utility function
+    await test_hooks_utility()
+
+    # Show usage with Docker client
+    await test_docker_client_with_functions()
+    await test_docker_client_with_strings()
+
+    # Show different patterns
+    await show_usage_patterns()
+
+    print("\n" + "=" * 60)
+    print("✓ All tests completed successfully!")
+    print("=" * 60)
+    print("\nKey Benefits:")
+    print("  • Write hooks as regular Python functions")
+    print("  • IDE support with autocomplete and type checking")
+    print("  • Automatic conversion to API format")
+    print("  • Backward compatible with string hooks")
+    print("  • Same utility used everywhere")
+    print("\n")
+
+
+# Run the script's checks only when executed directly, not when imported
+if __name__ == "__main__":
+    asyncio.run(main())