From a3f057e19fc60245ef16c9b8fb1639c7b1555fb2 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 13 Oct 2025 12:34:08 +0800 Subject: [PATCH] feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377 Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py) --- crawl4ai/__init__.py | 4 +- crawl4ai/docker_client.py | 75 ++- crawl4ai/utils.py | 51 +- docs/examples/docker_client_hooks_example.py | 522 +++++++++++++++++++ docs/md_v2/core/docker-deployment.md | 397 ++++++++++++-- tests/docker/test_hooks_utility.py | 193 +++++++ 6 files changed, 1198 insertions(+), 44 deletions(-) create mode 100644 docs/examples/docker_client_hooks_example.py create mode 100644 tests/docker/test_hooks_utility.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e..8f1fdef4 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -103,7 +103,8 @@ from .browser_adapter import ( from .utils import ( start_colab_display_server, - setup_colab_environment + setup_colab_environment, + hooks_to_string ) __all__ = [ @@ -183,6 +184,7 @@ __all__ = [ "ProxyConfig", "start_colab_display_server", "setup_colab_environment", + "hooks_to_string", # C4A Script additions "c4a_compile", "c4a_validate", diff --git a/crawl4ai/docker_client.py 
b/crawl4ai/docker_client.py index 4e33431f..969fee7c 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, AsyncGenerator, Dict, Any +from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable import httpx import json from urllib.parse import urljoin @@ -7,6 +7,7 @@ import asyncio from .async_configs import BrowserConfig, CrawlerRunConfig from .models import CrawlResult from .async_logger import AsyncLogger, LogLevel +from .utils import hooks_to_string class Crawl4aiClientError(Exception): @@ -70,17 +71,41 @@ class Crawl4aiDockerClient: self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR") raise ConnectionError(f"Cannot connect to server: {str(e)}") - def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]: + def _prepare_request( + self, + urls: List[str], + browser_config: Optional[BrowserConfig] = None, + crawler_config: Optional[CrawlerRunConfig] = None, + hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None, + hooks_timeout: int = 30 + ) -> Dict[str, Any]: """Prepare request data from configs.""" if self._token: self._http_client.headers["Authorization"] = f"Bearer {self._token}" - return { + + request_data = { "urls": urls, "browser_config": browser_config.dump() if browser_config else {}, "crawler_config": crawler_config.dump() if crawler_config else {} } + # Handle hooks if provided + if hooks: + # Check if hooks are already strings or need conversion + if any(callable(v) for v in hooks.values()): + # Convert function objects to strings + hooks_code = hooks_to_string(hooks) + else: + # Already in string format + hooks_code = hooks + + request_data["hooks"] = { + "code": hooks_code, + "timeout": hooks_timeout + } + + return request_data + async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response: """Make an HTTP 
request with error handling.""" url = urljoin(self.base_url, endpoint) @@ -102,16 +127,42 @@ class Crawl4aiDockerClient: self, urls: List[str], browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None + crawler_config: Optional[CrawlerRunConfig] = None, + hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None, + hooks_timeout: int = 30 ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]: - """Execute a crawl operation.""" + """ + Execute a crawl operation. + + Args: + urls: List of URLs to crawl + browser_config: Browser configuration + crawler_config: Crawler configuration + hooks: Optional hooks - can be either: + - Dict[str, Callable]: Function objects that will be converted to strings + - Dict[str, str]: Already stringified hook code + hooks_timeout: Timeout in seconds for each hook execution (1-120) + + Returns: + Single CrawlResult, list of results, or async generator for streaming + + Example with function hooks: + >>> async def my_hook(page, context, **kwargs): + ... await page.set_viewport_size({"width": 1920, "height": 1080}) + ... return page + >>> + >>> result = await client.crawl( + ... ["https://example.com"], + ... hooks={"on_page_context_created": my_hook} + ... 
) + """ await self._check_server() - - data = self._prepare_request(urls, browser_config, crawler_config) + + data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout) is_streaming = crawler_config and crawler_config.stream - + self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL") - + if is_streaming: async def stream_results() -> AsyncGenerator[CrawlResult, None]: async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response: @@ -128,12 +179,12 @@ class Crawl4aiDockerClient: else: yield CrawlResult(**result) return stream_results() - + response = await self._request("POST", "/crawl", json=data) result_data = response.json() if not result_data.get("success", False): raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}") - + results = [CrawlResult(**r) for r in result_data.get("results", [])] self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL") return results[0] if len(results) == 1 else results diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 046351e7..bbd7ffa2 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -47,6 +47,7 @@ from urllib.parse import ( urljoin, urlparse, urlunparse, parse_qsl, urlencode, quote, unquote ) +import inspect # Monkey patch to fix wildcard handling in urllib.robotparser @@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]: available_gb = get_true_available_memory_gb() used_percent = get_true_memory_usage_percent() - return used_percent, available_gb, total_gb \ No newline at end of file + return used_percent, available_gb, total_gb + + +# Hook utilities for Docker API +def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]: + """ + Convert hook function objects to string representations for Docker API. 
+ + This utility simplifies the process of using hooks with the Docker API by converting + Python function objects into the string format required by the API. + + Args: + hooks: Dictionary mapping hook point names to Python function objects. + Functions should be async and follow hook signature requirements. + + Returns: + Dictionary mapping hook point names to string representations of the functions. + + Example: + >>> async def my_hook(page, context, **kwargs): + ... await page.set_viewport_size({"width": 1920, "height": 1080}) + ... return page + >>> + >>> hooks_dict = {"on_page_context_created": my_hook} + >>> api_hooks = hooks_to_string(hooks_dict) + >>> # api_hooks is now ready to use with Docker API + + Raises: + ValueError: If a hook is not callable or source cannot be extracted + """ + result = {} + + for hook_name, hook_func in hooks.items(): + if not callable(hook_func): + raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}") + + try: + # Get the source code of the function + source = inspect.getsource(hook_func) + # Remove any leading indentation to get clean source + source = textwrap.dedent(source) + result[hook_name] = source + except (OSError, TypeError) as e: + raise ValueError( + f"Cannot extract source code for hook '{hook_name}'. " + f"Make sure the function is defined in a file (not interactively). Error: {e}" + ) + + return result diff --git a/docs/examples/docker_client_hooks_example.py b/docs/examples/docker_client_hooks_example.py new file mode 100644 index 00000000..1aa27fdc --- /dev/null +++ b/docs/examples/docker_client_hooks_example.py @@ -0,0 +1,522 @@ +#!/usr/bin/env python3 +""" +Comprehensive hooks examples using Docker Client with function objects. 
+ +This approach is recommended because: +- Write hooks as regular Python functions +- Full IDE support (autocomplete, type checking) +- Automatic conversion to API format +- Reusable and testable code +- Clean, readable syntax +""" + +import asyncio +from crawl4ai import Crawl4aiDockerClient + +# API_BASE_URL = "http://localhost:11235" +API_BASE_URL = "http://localhost:11234" + + +# ============================================================================ +# Hook Function Definitions +# ============================================================================ + +# --- All Hooks Demo --- +async def browser_created_hook(browser, **kwargs): + """Called after browser is created""" + print("[HOOK] Browser created and ready") + return browser + + +async def page_context_hook(page, context, **kwargs): + """Setup page environment""" + print("[HOOK] Setting up page environment") + + # Set viewport + await page.set_viewport_size({"width": 1920, "height": 1080}) + + # Add cookies + await context.add_cookies([{ + "name": "test_session", + "value": "abc123xyz", + "domain": ".httpbin.org", + "path": "/" + }]) + + # Block resources + await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort()) + await context.route("**/analytics/*", lambda route: route.abort()) + + print("[HOOK] Environment configured") + return page + + +async def user_agent_hook(page, context, user_agent, **kwargs): + """Called when user agent is updated""" + print(f"[HOOK] User agent: {user_agent[:50]}...") + return page + + +async def before_goto_hook(page, context, url, **kwargs): + """Called before navigating to URL""" + print(f"[HOOK] Navigating to: {url}") + + await page.set_extra_http_headers({ + "X-Custom-Header": "crawl4ai-test", + "Accept-Language": "en-US" + }) + + return page + + +async def after_goto_hook(page, context, url, response, **kwargs): + """Called after page loads""" + print(f"[HOOK] Page loaded: {url}") + + await page.wait_for_timeout(1000) + + try: + await 
page.wait_for_selector("body", timeout=2000) + print("[HOOK] Body element ready") + except: + print("[HOOK] Timeout, continuing") + + return page + + +async def execution_started_hook(page, context, **kwargs): + """Called when custom JS execution starts""" + print("[HOOK] JS execution started") + await page.evaluate("console.log('[HOOK] Custom JS');") + return page + + +async def before_retrieve_hook(page, context, **kwargs): + """Called before retrieving HTML""" + print("[HOOK] Preparing HTML retrieval") + + # Scroll for lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + await page.wait_for_timeout(500) + await page.evaluate("window.scrollTo(0, 0);") + + print("[HOOK] Scrolling complete") + return page + + +async def before_return_hook(page, context, html, **kwargs): + """Called before returning HTML""" + print(f"[HOOK] HTML ready: {len(html)} chars") + + metrics = await page.evaluate('''() => ({ + images: document.images.length, + links: document.links.length, + scripts: document.scripts.length + })''') + + print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}") + return page + + +# --- Authentication Hooks --- +async def auth_context_hook(page, context, **kwargs): + """Setup authentication context""" + print("[HOOK] Setting up authentication") + + # Add auth cookies + await context.add_cookies([{ + "name": "auth_token", + "value": "fake_jwt_token", + "domain": ".httpbin.org", + "path": "/", + "httpOnly": True + }]) + + # Set localStorage + await page.evaluate(''' + localStorage.setItem('user_id', '12345'); + localStorage.setItem('auth_time', new Date().toISOString()); + ''') + + print("[HOOK] Auth context ready") + return page + + +async def auth_headers_hook(page, context, url, **kwargs): + """Add authentication headers""" + print(f"[HOOK] Adding auth headers for {url}") + + import base64 + credentials = base64.b64encode(b"user:passwd").decode('ascii') + + await page.set_extra_http_headers({ + 
'Authorization': f'Basic {credentials}', + 'X-API-Key': 'test-key-123' + }) + + return page + + +# --- Performance Optimization Hooks --- +async def performance_hook(page, context, **kwargs): + """Optimize page for performance""" + print("[HOOK] Optimizing for performance") + + # Block resource-heavy content + await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort()) + await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort()) + await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort()) + await context.route("**/googletagmanager.com/*", lambda r: r.abort()) + await context.route("**/google-analytics.com/*", lambda r: r.abort()) + await context.route("**/facebook.com/*", lambda r: r.abort()) + + # Disable animations + await page.add_style_tag(content=''' + *, *::before, *::after { + animation-duration: 0s !important; + transition-duration: 0s !important; + } + ''') + + print("[HOOK] Optimizations applied") + return page + + +async def cleanup_hook(page, context, **kwargs): + """Clean page before extraction""" + print("[HOOK] Cleaning page") + + await page.evaluate('''() => { + const selectors = [ + '.ad', '.ads', '.advertisement', + '.popup', '.modal', '.overlay', + '.cookie-banner', '.newsletter' + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(el => el.remove()); + }); + + document.querySelectorAll('script, style').forEach(el => el.remove()); + }''') + + print("[HOOK] Page cleaned") + return page + + +# --- Content Extraction Hooks --- +async def wait_dynamic_content_hook(page, context, url, response, **kwargs): + """Wait for dynamic content to load""" + print(f"[HOOK] Waiting for dynamic content on {url}") + + await page.wait_for_timeout(2000) + + # Click "Load More" if exists + try: + load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")') + if load_more: + await load_more.click() + await page.wait_for_timeout(1000) + print("[HOOK] Clicked 'Load More'") + except: + 
pass + + return page + + +async def extract_metadata_hook(page, context, **kwargs): + """Extract page metadata""" + print("[HOOK] Extracting metadata") + + metadata = await page.evaluate('''() => { + const getMeta = (name) => { + const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); + return el ? el.getAttribute('content') : null; + }; + + return { + title: document.title, + description: getMeta('description'), + author: getMeta('author'), + keywords: getMeta('keywords'), + }; + }''') + + print(f"[HOOK] Metadata: {metadata}") + + # Infinite scroll + for i in range(3): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + await page.wait_for_timeout(1000) + print(f"[HOOK] Scroll {i+1}/3") + + return page + + +# --- Multi-URL Hooks --- +async def url_specific_hook(page, context, url, **kwargs): + """Apply URL-specific logic""" + print(f"[HOOK] Processing URL: {url}") + + # URL-specific headers + if 'html' in url: + await page.set_extra_http_headers({"X-Type": "HTML"}) + elif 'json' in url: + await page.set_extra_http_headers({"X-Type": "JSON"}) + + return page + + +async def track_progress_hook(page, context, url, response, **kwargs): + """Track crawl progress""" + status = response.status if response else 'unknown' + print(f"[HOOK] Loaded {url} - Status: {status}") + return page + + +# ============================================================================ +# Test Functions +# ============================================================================ + +async def test_all_hooks_comprehensive(): + """Test all 8 hook types""" + print("=" * 70) + print("Test 1: All Hooks Comprehensive Demo (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nCrawling with all 8 hooks...") + + # Define hooks with function objects + hooks = { + "on_browser_created": browser_created_hook, + "on_page_context_created": page_context_hook, + 
"on_user_agent_updated": user_agent_hook, + "before_goto": before_goto_hook, + "after_goto": after_goto_hook, + "on_execution_started": execution_started_hook, + "before_retrieve_html": before_retrieve_hook, + "before_return_html": before_return_hook + } + + result = await client.crawl( + ["https://httpbin.org/html"], + hooks=hooks, + hooks_timeout=30 + ) + + print("\n✅ Success!") + print(f" URL: {result.url}") + print(f" Success: {result.success}") + print(f" HTML: {len(result.html)} chars") + + +async def test_authentication_workflow(): + """Test authentication with hooks""" + print("\n" + "=" * 70) + print("Test 2: Authentication Workflow (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting authentication...") + + hooks = { + "on_page_context_created": auth_context_hook, + "before_goto": auth_headers_hook + } + + result = await client.crawl( + ["https://httpbin.org/basic-auth/user/passwd"], + hooks=hooks, + hooks_timeout=15 + ) + + print("\n✅ Authentication completed") + + if result.success: + if '"authenticated"' in result.html and 'true' in result.html: + print(" ✅ Basic auth successful!") + else: + print(" ⚠️ Auth status unclear") + else: + print(f" ❌ Failed: {result.error_message}") + + +async def test_performance_optimization(): + """Test performance optimization""" + print("\n" + "=" * 70) + print("Test 3: Performance Optimization (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting performance hooks...") + + hooks = { + "on_page_context_created": performance_hook, + "before_retrieve_html": cleanup_hook + } + + result = await client.crawl( + ["https://httpbin.org/html"], + hooks=hooks, + hooks_timeout=10 + ) + + print("\n✅ Optimization completed") + print(f" HTML size: {len(result.html):,} chars") + print(" Resources blocked, ads removed") + + +async def test_content_extraction(): + 
"""Test content extraction""" + print("\n" + "=" * 70) + print("Test 4: Content Extraction (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting extraction hooks...") + + hooks = { + "after_goto": wait_dynamic_content_hook, + "before_retrieve_html": extract_metadata_hook + } + + result = await client.crawl( + ["https://www.kidocode.com/"], + hooks=hooks, + hooks_timeout=20 + ) + + print("\n✅ Extraction completed") + print(f" URL: {result.url}") + print(f" Success: {result.success}") + print(f" Metadata: {result.metadata}") + + +async def test_multi_url_crawl(): + """Test hooks with multiple URLs""" + print("\n" + "=" * 70) + print("Test 5: Multi-URL Crawl (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nCrawling multiple URLs...") + + hooks = { + "before_goto": url_specific_hook, + "after_goto": track_progress_hook + } + + results = await client.crawl( + [ + "https://httpbin.org/html", + "https://httpbin.org/json", + "https://httpbin.org/xml" + ], + hooks=hooks, + hooks_timeout=15 + ) + + print("\n✅ Multi-URL crawl completed") + print(f"\n Crawled {len(results)} URLs:") + for i, result in enumerate(results, 1): + status = "✅" if result.success else "❌" + print(f" {status} {i}. 
{result.url}") + + +async def test_reusable_hook_library(): + """Test using reusable hook library""" + print("\n" + "=" * 70) + print("Test 6: Reusable Hook Library (Docker Client)") + print("=" * 70) + + # Create a library of reusable hooks + class HookLibrary: + @staticmethod + async def block_images(page, context, **kwargs): + """Block all images""" + await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort()) + print("[LIBRARY] Images blocked") + return page + + @staticmethod + async def block_analytics(page, context, **kwargs): + """Block analytics""" + await context.route("**/analytics/*", lambda r: r.abort()) + await context.route("**/google-analytics.com/*", lambda r: r.abort()) + print("[LIBRARY] Analytics blocked") + return page + + @staticmethod + async def scroll_infinite(page, context, **kwargs): + """Handle infinite scroll""" + for i in range(5): + prev = await page.evaluate("document.body.scrollHeight") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + await page.wait_for_timeout(1000) + curr = await page.evaluate("document.body.scrollHeight") + if curr == prev: + break + print("[LIBRARY] Infinite scroll complete") + return page + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nUsing hook library...") + + hooks = { + "on_page_context_created": HookLibrary.block_images, + "before_retrieve_html": HookLibrary.scroll_infinite + } + + result = await client.crawl( + ["https://www.kidocode.com/"], + hooks=hooks, + hooks_timeout=20 + ) + + print("\n✅ Library hooks completed") + print(f" Success: {result.success}") + + +# ============================================================================ +# Main +# ============================================================================ + +async def main(): + """Run all Docker client hook examples""" + print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)") + print("Using Python function objects with automatic conversion") 
+ print("=" * 70) + + tests = [ + ("All Hooks Demo", test_all_hooks_comprehensive), + ("Authentication", test_authentication_workflow), + ("Performance", test_performance_optimization), + ("Extraction", test_content_extraction), + ("Multi-URL", test_multi_url_crawl), + ("Hook Library", test_reusable_hook_library) + ] + + for i, (name, test_func) in enumerate(tests, 1): + try: + await test_func() + print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n") + except Exception as e: + print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n") + import traceback + traceback.print_exc() + + print("=" * 70) + print("🎉 All Docker client hook examples completed!") + print("\n💡 Key Benefits of Function-Based Hooks:") + print(" • Write as regular Python functions") + print(" • Full IDE support (autocomplete, types)") + print(" • Automatic conversion to API format") + print(" • Reusable across projects") + print(" • Clean, readable code") + print(" • Easy to test and debug") + print("=" * 70) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md index ea3692b2..36bf28e1 100644 --- a/docs/md_v2/core/docker-deployment.md +++ b/docs/md_v2/core/docker-deployment.md @@ -6,18 +6,6 @@ - [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended) - [Option 2: Using Docker Compose](#option-2-using-docker-compose) - [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run) -- [Dockerfile Parameters](#dockerfile-parameters) -- [Using the API](#using-the-api) - - [Playground Interface](#playground-interface) - - [Python SDK](#python-sdk) - - [Understanding Request Schema](#understanding-request-schema) - - [REST API Examples](#rest-api-examples) -- [Additional API Endpoints](#additional-api-endpoints) - - [HTML Extraction Endpoint](#html-extraction-endpoint) - - [Screenshot Endpoint](#screenshot-endpoint) - - [PDF Export 
Endpoint](#pdf-export-endpoint) - - [JavaScript Execution Endpoint](#javascript-execution-endpoint) - - [Library Context Endpoint](#library-context-endpoint) - [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support) - [What is MCP?](#what-is-mcp) - [Connecting via MCP](#connecting-via-mcp) @@ -25,9 +13,28 @@ - [Available MCP Tools](#available-mcp-tools) - [Testing MCP Connections](#testing-mcp-connections) - [MCP Schemas](#mcp-schemas) +- [Additional API Endpoints](#additional-api-endpoints) + - [HTML Extraction Endpoint](#html-extraction-endpoint) + - [Screenshot Endpoint](#screenshot-endpoint) + - [PDF Export Endpoint](#pdf-export-endpoint) + - [JavaScript Execution Endpoint](#javascript-execution-endpoint) +- [User-Provided Hooks API](#user-provided-hooks-api) + - [Hook Information Endpoint](#hook-information-endpoint) + - [Available Hook Points](#available-hook-points) + - [Using Hooks in Requests](#using-hooks-in-requests) + - [Hook Examples with Real URLs](#hook-examples-with-real-urls) + - [Security Best Practices](#security-best-practices) + - [Hook Response Information](#hook-response-information) + - [Error Handling](#error-handling) + - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Playground Interface](#playground-interface) + - [Python SDK](#python-sdk) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [LLM Configuration Examples](#llm-configuration-examples) - [Metrics & Monitoring](#metrics--monitoring) -- [Deployment Scenarios](#deployment-scenarios) -- [Complete Examples](#complete-examples) - [Server Configuration](#server-configuration) - [Understanding config.yml](#understanding-configyml) - [JWT Authentication](#jwt-authentication) @@ -832,6 +839,275 @@ else: > 💡 **Remember**: Always test your hooks on safe, known websites 
first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious. +### Hooks Utility: Function-Based Approach (Python) + +For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration. + +#### Why Use Function-Based Hooks? + +**String-Based Approach (shown above)**: +```python +hooks_code = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +""" +} +``` + +**Function-Based Approach (recommended for Python)**: +```python +from crawl4ai import Crawl4aiDockerClient + +async def my_hook(page, context, **kwargs): + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + +async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + result = await client.crawl( + ["https://example.com"], + hooks={"on_page_context_created": my_hook} + ) +``` + +**Benefits**: +- ✅ Write hooks as regular Python functions +- ✅ Full IDE support (autocomplete, syntax highlighting, type checking) +- ✅ Easy to test and debug +- ✅ Reusable hook libraries +- ✅ Automatic conversion to API format + +#### Using the Hooks Utility + +The `hooks_to_string()` utility converts Python function objects to the string format required by the API: + +```python +from crawl4ai import hooks_to_string + +# Define your hooks as functions +async def setup_hook(page, context, **kwargs): + await page.set_viewport_size({"width": 1920, "height": 1080}) + await context.add_cookies([{ + "name": "session", + "value": "token", + "domain": ".example.com" + }]) + return page + +async def scroll_hook(page, context, **kwargs): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + return page + +# Convert to string format +hooks_dict = { + "on_page_context_created": setup_hook, + "before_retrieve_html": 
scroll_hook +} +hooks_string = hooks_to_string(hooks_dict) + +# Now use with REST API or Docker client +# hooks_string contains the string representations +``` + +#### Docker Client with Automatic Conversion + +The Docker client automatically detects and converts function objects: + +```python +from crawl4ai import Crawl4aiDockerClient + +async def auth_hook(page, context, **kwargs): + """Add authentication cookies""" + await context.add_cookies([{ + "name": "auth_token", + "value": "your_token", + "domain": ".example.com" + }]) + return page + +async def performance_hook(page, context, **kwargs): + """Block unnecessary resources""" + await context.route("**/*.{png,jpg,gif}", lambda r: r.abort()) + await context.route("**/analytics/*", lambda r: r.abort()) + return page + +async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + # Pass functions directly - automatic conversion! + result = await client.crawl( + ["https://example.com"], + hooks={ + "on_page_context_created": performance_hook, + "before_goto": auth_hook + }, + hooks_timeout=30 # Optional timeout in seconds (1-120) + ) + + print(f"Success: {result.success}") + print(f"HTML: {len(result.html)} chars") +``` + +#### Creating Reusable Hook Libraries + +Build collections of reusable hooks: + +```python +# hooks_library.py +class CrawlHooks: + """Reusable hook collection for common crawling tasks""" + + @staticmethod + async def block_images(page, context, **kwargs): + """Block all images to speed up crawling""" + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort()) + return page + + @staticmethod + async def block_analytics(page, context, **kwargs): + """Block analytics and tracking scripts""" + tracking_domains = [ + "**/google-analytics.com/*", + "**/googletagmanager.com/*", + "**/facebook.com/tr/*", + "**/doubleclick.net/*" + ] + for domain in tracking_domains: + await context.route(domain, lambda r: r.abort()) + return page + + @staticmethod + async def 
scroll_infinite(page, context, **kwargs): + """Handle infinite scroll to load more content""" + previous_height = 0 + for i in range(5): # Max 5 scrolls + current_height = await page.evaluate("document.body.scrollHeight") + if current_height == previous_height: + break + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + previous_height = current_height + return page + + @staticmethod + async def wait_for_dynamic_content(page, context, url, response, **kwargs): + """Wait for dynamic content to load""" + await page.wait_for_timeout(2000) + try: + # Click "Load More" if present + load_more = await page.query_selector('[class*="load-more"]') + if load_more: + await load_more.click() + await page.wait_for_timeout(1000) + except: + pass + return page + +# Use in your application +from hooks_library import CrawlHooks +from crawl4ai import Crawl4aiDockerClient + +async def crawl_with_optimizations(url): + async with Crawl4aiDockerClient() as client: + result = await client.crawl( + [url], + hooks={ + "on_page_context_created": CrawlHooks.block_images, + "before_retrieve_html": CrawlHooks.scroll_infinite + } + ) + return result +``` + +#### Choosing the Right Approach + +| Approach | Best For | IDE Support | Language | +|----------|----------|-------------|----------| +| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any | +| **Function-based** | Python applications, local development | ✅ Full | Python only | +| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only | + +**Recommendation**: +- **Python applications**: Use Docker client with function objects (easiest) +- **Non-Python or REST API**: Use string-based hooks (most flexible) +- **Manual control**: Use `hooks_to_string()` utility (middle ground) + +#### Complete Example with Function Hooks + +```python +from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode + +# Define 
hooks as regular Python functions +async def setup_environment(page, context, **kwargs): + """Setup crawling environment""" + # Set viewport + await page.set_viewport_size({"width": 1920, "height": 1080}) + + # Block resources for speed + await context.route("**/*.{png,jpg,gif}", lambda r: r.abort()) + + # Add custom headers + await page.set_extra_http_headers({ + "Accept-Language": "en-US", + "X-Custom-Header": "Crawl4AI" + }) + + print("[HOOK] Environment configured") + return page + +async def extract_content(page, context, **kwargs): + """Extract and prepare content""" + # Scroll to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + + # Extract metadata + metadata = await page.evaluate('''() => ({ + title: document.title, + links: document.links.length, + images: document.images.length + })''') + + print(f"[HOOK] Page metadata: {metadata}") + return page + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client: + # Configure crawl + browser_config = BrowserConfig(headless=True) + crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + + # Crawl with hooks + result = await client.crawl( + ["https://httpbin.org/html"], + browser_config=browser_config, + crawler_config=crawler_config, + hooks={ + "on_page_context_created": setup_environment, + "before_retrieve_html": extract_content + }, + hooks_timeout=30 + ) + + if result.success: + print(f"✅ Crawl successful!") + print(f" URL: {result.url}") + print(f" HTML: {len(result.html)} chars") + print(f" Markdown: {len(result.markdown)} chars") + else: + print(f"❌ Crawl failed: {result.error_message}") + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` + +#### Additional Resources + +- **Comprehensive Examples**: See `/docs/examples/docker_client_hooks_example.py` for Python function-based examples +- **REST API Examples**: See 
`/docs/examples/hooks_rest_api_example.py` for string-based examples +- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison +- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide + --- ## Dockerfile Parameters @@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when Install the SDK: `pip install crawl4ai` +The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects. + ```python import asyncio from crawl4ai.docker_client import Crawl4aiDockerClient -from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode async def main(): # Point to the correct server port @@ -907,23 +1185,22 @@ async def main(): print("--- Running Non-Streaming Crawl ---") results = await client.crawl( ["https://httpbin.org/html"], - browser_config=BrowserConfig(headless=True), # Use library classes for config aid + browser_config=BrowserConfig(headless=True), crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) ) - if results: # client.crawl returns None on failure - print(f"Non-streaming results success: {results.success}") - if results.success: - for result in results: # Iterate through the CrawlResultContainer - print(f"URL: {result.url}, Success: {result.success}") + if results: + print(f"Non-streaming results success: {results.success}") + if results.success: + for result in results: + print(f"URL: {result.url}, Success: {result.success}") else: print("Non-streaming crawl failed.") - # Example Streaming crawl print("\n--- Running Streaming Crawl ---") stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) try: - async for result in await client.crawl( # client.crawl returns an async generator for streaming + async for result in await client.crawl( 
["https://httpbin.org/html", "https://httpbin.org/links/5/0"], browser_config=BrowserConfig(headless=True), crawler_config=stream_config @@ -932,17 +1209,56 @@ async def main(): except Exception as e: print(f"Streaming crawl failed: {e}") + # Example with hooks (Python function objects) + print("\n--- Crawl with Hooks ---") + + async def my_hook(page, context, **kwargs): + """Custom hook to optimize performance""" + await page.set_viewport_size({"width": 1920, "height": 1080}) + await context.route("**/*.{png,jpg}", lambda r: r.abort()) + print("[HOOK] Page optimized") + return page + + result = await client.crawl( + ["https://httpbin.org/html"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS), + hooks={"on_page_context_created": my_hook}, # Pass function directly! + hooks_timeout=30 + ) + print(f"Crawl with hooks success: {result.success}") # Example Get schema print("\n--- Getting Schema ---") schema = await client.get_schema() - print(f"Schema received: {bool(schema)}") # Print whether schema was received + print(f"Schema received: {bool(schema)}") if __name__ == "__main__": asyncio.run(main()) ``` -*(SDK parameters like timeout, verify_ssl etc. 
remain the same)* +#### SDK Parameters + +The Docker client supports the following parameters: + +**Client Initialization**: +- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`) +- `timeout` (float): Request timeout in seconds (default: 30.0) +- `verify_ssl` (bool): Verify SSL certificates (default: True) +- `verbose` (bool): Enable verbose logging (default: True) +- `log_file` (Optional[str]): Path to log file (default: None) + +**crawl() Method**: +- `urls` (List[str]): List of URLs to crawl +- `browser_config` (Optional[BrowserConfig]): Browser configuration +- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration +- `hooks` (Optional[Dict]): Hook functions or strings - **automatically converts function objects!** +- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30) + +**Returns**: +- Single URL: `CrawlResult` object +- Multiple URLs: `List[CrawlResult]` +- Streaming: `AsyncGenerator[CrawlResult]` ### Second Approach: Direct API Calls @@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support: In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: - Building and running the Docker container -- Configuring the environment +- Configuring the environment - Using the interactive playground for testing - Making API requests with proper typing -- Using the Python SDK +- Using the Python SDK with **automatic hook conversion** +- **Working with hooks** - both string-based (REST API) and function-based (Python SDK) - Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution - Connecting via the Model Context Protocol (MCP) - Monitoring your deployment -The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests. 
+### Key Features -For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling. +**Hooks Support**: Crawl4AI offers two approaches for working with hooks: +- **String-based** (REST API): Works with any language, requires manual string formatting +- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion -Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. +**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests. + +**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling. + +### Next Steps + +1. **Explore Examples**: Check out the comprehensive examples in: + - `/docs/examples/docker_client_hooks_example.py` - Python function-based hooks + - `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks + - `/docs/examples/README_HOOKS.md` - Comparison and guide + +2. **Read Documentation**: + - `/docs/hooks-utility-guide.md` - Complete hooks utility guide + - API documentation for detailed configuration options + +3. **Join the Community**: + - GitHub: Report issues and contribute + - Discord: Get help and share your experiences + - Documentation: Comprehensive guides and tutorials Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 diff --git a/tests/docker/test_hooks_utility.py b/tests/docker/test_hooks_utility.py new file mode 100644 index 00000000..7c820e56 --- /dev/null +++ b/tests/docker/test_hooks_utility.py @@ -0,0 +1,193 @@ +""" +Test script demonstrating the hooks_to_string utility and Docker client integration. 
+""" +import asyncio +from crawl4ai import Crawl4aiDockerClient, hooks_to_string + + +# Define hook functions as regular Python functions +async def auth_hook(page, context, **kwargs): + """Add authentication cookies.""" + await context.add_cookies([{ + 'name': 'test_cookie', + 'value': 'test_value', + 'domain': '.httpbin.org', + 'path': '/' + }]) + return page + + +async def scroll_hook(page, context, **kwargs): + """Scroll to load lazy content.""" + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + return page + + +async def viewport_hook(page, context, **kwargs): + """Set custom viewport.""" + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + +async def test_hooks_utility(): + """Test the hooks_to_string utility function.""" + print("=" * 60) + print("Testing hooks_to_string utility") + print("=" * 60) + + # Create hooks dictionary with function objects + hooks_dict = { + "on_page_context_created": auth_hook, + "before_retrieve_html": scroll_hook + } + + # Convert to string format + hooks_string = hooks_to_string(hooks_dict) + + print("\n✓ Successfully converted function objects to strings") + print(f"\n✓ Converted {len(hooks_string)} hooks:") + for hook_name in hooks_string.keys(): + print(f" - {hook_name}") + + print("\n✓ Preview of converted hook:") + print("-" * 60) + print(hooks_string["on_page_context_created"][:200] + "...") + print("-" * 60) + + return hooks_string + + +async def test_docker_client_with_functions(): + """Test Docker client with function objects (automatic conversion).""" + print("\n" + "=" * 60) + print("Testing Docker Client with Function Objects") + print("=" * 60) + + # Note: This requires a running Crawl4AI Docker server + # Uncomment the following to test with actual server: + + async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client: + # Pass function objects directly - they'll be converted automatically + 
result = await client.crawl( + ["https://httpbin.org/html"], + hooks={ + "on_page_context_created": auth_hook, + "before_retrieve_html": scroll_hook + }, + hooks_timeout=30 + ) + print(f"\n✓ Crawl successful: {result.success}") + print(f"✓ URL: {result.url}") + + print("\n✓ Docker client accepts function objects directly") + print("✓ Automatic conversion happens internally") + print("✓ No manual string formatting needed!") + + +async def test_docker_client_with_strings(): + """Test Docker client with pre-converted strings.""" + print("\n" + "=" * 60) + print("Testing Docker Client with String Hooks") + print("=" * 60) + + # Convert hooks to strings first + hooks_dict = { + "on_page_context_created": viewport_hook, + "before_retrieve_html": scroll_hook + } + hooks_string = hooks_to_string(hooks_dict) + + # Note: This requires a running Crawl4AI Docker server + # Uncomment the following to test with actual server: + + async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client: + # Pass string hooks - they'll be used as-is + result = await client.crawl( + ["https://httpbin.org/html"], + hooks=hooks_string, + hooks_timeout=30 + ) + print(f"\n✓ Crawl successful: {result.success}") + + print("\n✓ Docker client also accepts pre-converted strings") + print("✓ Backward compatible with existing code") + + +async def show_usage_patterns(): + """Show different usage patterns.""" + print("\n" + "=" * 60) + print("Usage Patterns") + print("=" * 60) + + print("\n1. Direct function usage (simplest):") + print("-" * 60) + print(""" + async def my_hook(page, context, **kwargs): + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + result = await client.crawl( + ["https://example.com"], + hooks={"on_page_context_created": my_hook} + ) + """) + + print("\n2. 
Convert then use:") + print("-" * 60) + print(""" + hooks_dict = {"on_page_context_created": my_hook} + hooks_string = hooks_to_string(hooks_dict) + + result = await client.crawl( + ["https://example.com"], + hooks=hooks_string + ) + """) + + print("\n3. Manual string (backward compatible):") + print("-" * 60) + print(""" + hooks_string = { + "on_page_context_created": ''' +async def hook(page, context, **kwargs): + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +''' + } + + result = await client.crawl( + ["https://example.com"], + hooks=hooks_string + ) + """) + + +async def main(): + """Run all tests.""" + print("\n🚀 Crawl4AI Hooks Utility Test Suite\n") + + # Test the utility function + # await test_hooks_utility() + + # Show usage with Docker client + # await test_docker_client_with_functions() + await test_docker_client_with_strings() + + # Show different patterns + # await show_usage_patterns() + + # print("\n" + "=" * 60) + # print("✓ All tests completed successfully!") + # print("=" * 60) + # print("\nKey Benefits:") + # print(" • Write hooks as regular Python functions") + # print(" • IDE support with autocomplete and type checking") + # print(" • Automatic conversion to API format") + # print(" • Backward compatible with string hooks") + # print(" • Same utility used everywhere") + # print("\n") + + +if __name__ == "__main__": + asyncio.run(main())