diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index 6917f27e..8f1fdef4 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -103,7 +103,8 @@ from .browser_adapter import (
from .utils import (
start_colab_display_server,
- setup_colab_environment
+ setup_colab_environment,
+ hooks_to_string
)
__all__ = [
@@ -183,6 +184,7 @@ __all__ = [
"ProxyConfig",
"start_colab_display_server",
"setup_colab_environment",
+ "hooks_to_string",
# C4A Script additions
"c4a_compile",
"c4a_validate",
diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py
index 4e33431f..969fee7c 100644
--- a/crawl4ai/docker_client.py
+++ b/crawl4ai/docker_client.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, AsyncGenerator, Dict, Any
+from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
import httpx
import json
from urllib.parse import urljoin
@@ -7,6 +7,7 @@ import asyncio
from .async_configs import BrowserConfig, CrawlerRunConfig
from .models import CrawlResult
from .async_logger import AsyncLogger, LogLevel
+from .utils import hooks_to_string
class Crawl4aiClientError(Exception):
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
raise ConnectionError(f"Cannot connect to server: {str(e)}")
- def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
- crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
+ def _prepare_request(
+ self,
+ urls: List[str],
+ browser_config: Optional[BrowserConfig] = None,
+ crawler_config: Optional[CrawlerRunConfig] = None,
+ hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+ hooks_timeout: int = 30
+ ) -> Dict[str, Any]:
"""Prepare request data from configs."""
if self._token:
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
- return {
+
+ request_data = {
"urls": urls,
"browser_config": browser_config.dump() if browser_config else {},
"crawler_config": crawler_config.dump() if crawler_config else {}
}
+ # Handle hooks if provided
+ if hooks:
+        # Convert per-hook so mixed dicts (some callables, some
+        # pre-stringified hooks) are handled correctly: callables are
+        # stringified via hooks_to_string, strings pass through as-is.
+        hooks_code = {
+            name: hooks_to_string({name: fn})[name] if callable(fn) else fn
+            for name, fn in hooks.items()
+        }
+
+ request_data["hooks"] = {
+ "code": hooks_code,
+ "timeout": hooks_timeout
+ }
+
+ return request_data
+
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
"""Make an HTTP request with error handling."""
url = urljoin(self.base_url, endpoint)
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
self,
urls: List[str],
browser_config: Optional[BrowserConfig] = None,
- crawler_config: Optional[CrawlerRunConfig] = None
+ crawler_config: Optional[CrawlerRunConfig] = None,
+ hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
+ hooks_timeout: int = 30
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
- """Execute a crawl operation."""
+ """
+ Execute a crawl operation.
+
+ Args:
+ urls: List of URLs to crawl
+ browser_config: Browser configuration
+ crawler_config: Crawler configuration
+ hooks: Optional hooks - can be either:
+ - Dict[str, Callable]: Function objects that will be converted to strings
+ - Dict[str, str]: Already stringified hook code
+ hooks_timeout: Timeout in seconds for each hook execution (1-120)
+
+ Returns:
+ Single CrawlResult, list of results, or async generator for streaming
+
+ Example with function hooks:
+ >>> async def my_hook(page, context, **kwargs):
+ ... await page.set_viewport_size({"width": 1920, "height": 1080})
+ ... return page
+ >>>
+ >>> result = await client.crawl(
+ ... ["https://example.com"],
+ ... hooks={"on_page_context_created": my_hook}
+ ... )
+ """
await self._check_server()
-
- data = self._prepare_request(urls, browser_config, crawler_config)
+
+ data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
is_streaming = crawler_config and crawler_config.stream
-
+
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
-
+
if is_streaming:
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
else:
yield CrawlResult(**result)
return stream_results()
-
+
response = await self._request("POST", "/crawl", json=data)
result_data = response.json()
if not result_data.get("success", False):
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
-
+
results = [CrawlResult(**r) for r in result_data.get("results", [])]
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
return results[0] if len(results) == 1 else results
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 046351e7..bbd7ffa2 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -47,6 +47,7 @@ from urllib.parse import (
urljoin, urlparse, urlunparse,
parse_qsl, urlencode, quote, unquote
)
+import inspect, textwrap  # both used by hooks_to_string; re-import is harmless if textwrap is already imported
# Monkey patch to fix wildcard handling in urllib.robotparser
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
available_gb = get_true_available_memory_gb()
used_percent = get_true_memory_usage_percent()
- return used_percent, available_gb, total_gb
\ No newline at end of file
+ return used_percent, available_gb, total_gb
+
+
+# Hook utilities for Docker API
+def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
+ """
+ Convert hook function objects to string representations for Docker API.
+
+ This utility simplifies the process of using hooks with the Docker API by converting
+ Python function objects into the string format required by the API.
+
+ Args:
+ hooks: Dictionary mapping hook point names to Python function objects.
+ Functions should be async and follow hook signature requirements.
+
+ Returns:
+ Dictionary mapping hook point names to string representations of the functions.
+
+ Example:
+ >>> async def my_hook(page, context, **kwargs):
+ ... await page.set_viewport_size({"width": 1920, "height": 1080})
+ ... return page
+ >>>
+ >>> hooks_dict = {"on_page_context_created": my_hook}
+ >>> api_hooks = hooks_to_string(hooks_dict)
+ >>> # api_hooks is now ready to use with Docker API
+
+ Raises:
+ ValueError: If a hook is not callable or source cannot be extracted
+ """
+ result = {}
+
+ for hook_name, hook_func in hooks.items():
+ if not callable(hook_func):
+ raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")
+
+ try:
+ # Get the source code of the function
+ source = inspect.getsource(hook_func)
+ # Remove any leading indentation to get clean source
+ source = textwrap.dedent(source)
+ result[hook_name] = source
+ except (OSError, TypeError) as e:
+ raise ValueError(
+ f"Cannot extract source code for hook '{hook_name}'. "
+ f"Make sure the function is defined in a file (not interactively). Error: {e}"
+            ) from e
+
+ return result
diff --git a/docs/examples/docker_client_hooks_example.py b/docs/examples/docker_client_hooks_example.py
new file mode 100644
index 00000000..1aa27fdc
--- /dev/null
+++ b/docs/examples/docker_client_hooks_example.py
@@ -0,0 +1,522 @@
+#!/usr/bin/env python3
+"""
+Comprehensive hooks examples using Docker Client with function objects.
+
+This approach is recommended because:
+- Write hooks as regular Python functions
+- Full IDE support (autocomplete, type checking)
+- Automatic conversion to API format
+- Reusable and testable code
+- Clean, readable syntax
+"""
+
+import asyncio
+from crawl4ai import Crawl4aiDockerClient
+
+API_BASE_URL = "http://localhost:11235"  # default Crawl4AI server port
+# API_BASE_URL = "http://localhost:11234"  # alternate port for local testing
+
+
+# ============================================================================
+# Hook Function Definitions
+# ============================================================================
+
+# --- All Hooks Demo ---
+async def browser_created_hook(browser, **kwargs):
+ """Called after browser is created"""
+ print("[HOOK] Browser created and ready")
+ return browser
+
+
+async def page_context_hook(page, context, **kwargs):
+ """Setup page environment"""
+ print("[HOOK] Setting up page environment")
+
+ # Set viewport
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+
+ # Add cookies
+ await context.add_cookies([{
+ "name": "test_session",
+ "value": "abc123xyz",
+ "domain": ".httpbin.org",
+ "path": "/"
+ }])
+
+ # Block resources
+ await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
+ await context.route("**/analytics/*", lambda route: route.abort())
+
+ print("[HOOK] Environment configured")
+ return page
+
+
+async def user_agent_hook(page, context, user_agent, **kwargs):
+ """Called when user agent is updated"""
+ print(f"[HOOK] User agent: {user_agent[:50]}...")
+ return page
+
+
+async def before_goto_hook(page, context, url, **kwargs):
+ """Called before navigating to URL"""
+ print(f"[HOOK] Navigating to: {url}")
+
+ await page.set_extra_http_headers({
+ "X-Custom-Header": "crawl4ai-test",
+ "Accept-Language": "en-US"
+ })
+
+ return page
+
+
+async def after_goto_hook(page, context, url, response, **kwargs):
+ """Called after page loads"""
+ print(f"[HOOK] Page loaded: {url}")
+
+ await page.wait_for_timeout(1000)
+
+ try:
+ await page.wait_for_selector("body", timeout=2000)
+ print("[HOOK] Body element ready")
+    except Exception:
+ print("[HOOK] Timeout, continuing")
+
+ return page
+
+
+async def execution_started_hook(page, context, **kwargs):
+ """Called when custom JS execution starts"""
+ print("[HOOK] JS execution started")
+ await page.evaluate("console.log('[HOOK] Custom JS');")
+ return page
+
+
+async def before_retrieve_hook(page, context, **kwargs):
+ """Called before retrieving HTML"""
+ print("[HOOK] Preparing HTML retrieval")
+
+ # Scroll for lazy content
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+ await page.wait_for_timeout(500)
+ await page.evaluate("window.scrollTo(0, 0);")
+
+ print("[HOOK] Scrolling complete")
+ return page
+
+
+async def before_return_hook(page, context, html, **kwargs):
+ """Called before returning HTML"""
+ print(f"[HOOK] HTML ready: {len(html)} chars")
+
+ metrics = await page.evaluate('''() => ({
+ images: document.images.length,
+ links: document.links.length,
+ scripts: document.scripts.length
+ })''')
+
+ print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
+ return page
+
+
+# --- Authentication Hooks ---
+async def auth_context_hook(page, context, **kwargs):
+ """Setup authentication context"""
+ print("[HOOK] Setting up authentication")
+
+ # Add auth cookies
+ await context.add_cookies([{
+ "name": "auth_token",
+ "value": "fake_jwt_token",
+ "domain": ".httpbin.org",
+ "path": "/",
+ "httpOnly": True
+ }])
+
+ # Set localStorage
+ await page.evaluate('''
+ localStorage.setItem('user_id', '12345');
+ localStorage.setItem('auth_time', new Date().toISOString());
+ ''')
+
+ print("[HOOK] Auth context ready")
+ return page
+
+
+async def auth_headers_hook(page, context, url, **kwargs):
+ """Add authentication headers"""
+ print(f"[HOOK] Adding auth headers for {url}")
+
+ import base64
+ credentials = base64.b64encode(b"user:passwd").decode('ascii')
+
+ await page.set_extra_http_headers({
+ 'Authorization': f'Basic {credentials}',
+ 'X-API-Key': 'test-key-123'
+ })
+
+ return page
+
+
+# --- Performance Optimization Hooks ---
+async def performance_hook(page, context, **kwargs):
+ """Optimize page for performance"""
+ print("[HOOK] Optimizing for performance")
+
+ # Block resource-heavy content
+ await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort())
+ await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort())
+ await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort())
+ await context.route("**/googletagmanager.com/*", lambda r: r.abort())
+ await context.route("**/google-analytics.com/*", lambda r: r.abort())
+ await context.route("**/facebook.com/*", lambda r: r.abort())
+
+ # Disable animations
+ await page.add_style_tag(content='''
+ *, *::before, *::after {
+ animation-duration: 0s !important;
+ transition-duration: 0s !important;
+ }
+ ''')
+
+ print("[HOOK] Optimizations applied")
+ return page
+
+
+async def cleanup_hook(page, context, **kwargs):
+ """Clean page before extraction"""
+ print("[HOOK] Cleaning page")
+
+ await page.evaluate('''() => {
+ const selectors = [
+ '.ad', '.ads', '.advertisement',
+ '.popup', '.modal', '.overlay',
+ '.cookie-banner', '.newsletter'
+ ];
+
+ selectors.forEach(sel => {
+ document.querySelectorAll(sel).forEach(el => el.remove());
+ });
+
+ document.querySelectorAll('script, style').forEach(el => el.remove());
+ }''')
+
+ print("[HOOK] Page cleaned")
+ return page
+
+
+# --- Content Extraction Hooks ---
+async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
+ """Wait for dynamic content to load"""
+ print(f"[HOOK] Waiting for dynamic content on {url}")
+
+ await page.wait_for_timeout(2000)
+
+ # Click "Load More" if exists
+ try:
+ load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
+ if load_more:
+ await load_more.click()
+ await page.wait_for_timeout(1000)
+ print("[HOOK] Clicked 'Load More'")
+    except Exception:
+ pass
+
+ return page
+
+
+async def extract_metadata_hook(page, context, **kwargs):
+ """Extract page metadata"""
+ print("[HOOK] Extracting metadata")
+
+ metadata = await page.evaluate('''() => {
+ const getMeta = (name) => {
+ const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
+ return el ? el.getAttribute('content') : null;
+ };
+
+ return {
+ title: document.title,
+ description: getMeta('description'),
+ author: getMeta('author'),
+ keywords: getMeta('keywords'),
+ };
+ }''')
+
+ print(f"[HOOK] Metadata: {metadata}")
+
+ # Infinite scroll
+ for i in range(3):
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+ await page.wait_for_timeout(1000)
+ print(f"[HOOK] Scroll {i+1}/3")
+
+ return page
+
+
+# --- Multi-URL Hooks ---
+async def url_specific_hook(page, context, url, **kwargs):
+ """Apply URL-specific logic"""
+ print(f"[HOOK] Processing URL: {url}")
+
+ # URL-specific headers
+ if 'html' in url:
+ await page.set_extra_http_headers({"X-Type": "HTML"})
+ elif 'json' in url:
+ await page.set_extra_http_headers({"X-Type": "JSON"})
+
+ return page
+
+
+async def track_progress_hook(page, context, url, response, **kwargs):
+ """Track crawl progress"""
+ status = response.status if response else 'unknown'
+ print(f"[HOOK] Loaded {url} - Status: {status}")
+ return page
+
+
+# ============================================================================
+# Test Functions
+# ============================================================================
+
+async def test_all_hooks_comprehensive():
+ """Test all 8 hook types"""
+ print("=" * 70)
+ print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
+ print("=" * 70)
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nCrawling with all 8 hooks...")
+
+ # Define hooks with function objects
+ hooks = {
+ "on_browser_created": browser_created_hook,
+ "on_page_context_created": page_context_hook,
+ "on_user_agent_updated": user_agent_hook,
+ "before_goto": before_goto_hook,
+ "after_goto": after_goto_hook,
+ "on_execution_started": execution_started_hook,
+ "before_retrieve_html": before_retrieve_hook,
+ "before_return_html": before_return_hook
+ }
+
+ result = await client.crawl(
+ ["https://httpbin.org/html"],
+ hooks=hooks,
+ hooks_timeout=30
+ )
+
+ print("\n✅ Success!")
+ print(f" URL: {result.url}")
+ print(f" Success: {result.success}")
+ print(f" HTML: {len(result.html)} chars")
+
+
+async def test_authentication_workflow():
+ """Test authentication with hooks"""
+ print("\n" + "=" * 70)
+ print("Test 2: Authentication Workflow (Docker Client)")
+ print("=" * 70)
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nTesting authentication...")
+
+ hooks = {
+ "on_page_context_created": auth_context_hook,
+ "before_goto": auth_headers_hook
+ }
+
+ result = await client.crawl(
+ ["https://httpbin.org/basic-auth/user/passwd"],
+ hooks=hooks,
+ hooks_timeout=15
+ )
+
+ print("\n✅ Authentication completed")
+
+ if result.success:
+ if '"authenticated"' in result.html and 'true' in result.html:
+ print(" ✅ Basic auth successful!")
+ else:
+ print(" ⚠️ Auth status unclear")
+ else:
+ print(f" ❌ Failed: {result.error_message}")
+
+
+async def test_performance_optimization():
+ """Test performance optimization"""
+ print("\n" + "=" * 70)
+ print("Test 3: Performance Optimization (Docker Client)")
+ print("=" * 70)
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nTesting performance hooks...")
+
+ hooks = {
+ "on_page_context_created": performance_hook,
+ "before_retrieve_html": cleanup_hook
+ }
+
+ result = await client.crawl(
+ ["https://httpbin.org/html"],
+ hooks=hooks,
+ hooks_timeout=10
+ )
+
+ print("\n✅ Optimization completed")
+ print(f" HTML size: {len(result.html):,} chars")
+ print(" Resources blocked, ads removed")
+
+
+async def test_content_extraction():
+ """Test content extraction"""
+ print("\n" + "=" * 70)
+ print("Test 4: Content Extraction (Docker Client)")
+ print("=" * 70)
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nTesting extraction hooks...")
+
+ hooks = {
+ "after_goto": wait_dynamic_content_hook,
+ "before_retrieve_html": extract_metadata_hook
+ }
+
+ result = await client.crawl(
+ ["https://www.kidocode.com/"],
+ hooks=hooks,
+ hooks_timeout=20
+ )
+
+ print("\n✅ Extraction completed")
+ print(f" URL: {result.url}")
+ print(f" Success: {result.success}")
+ print(f" Metadata: {result.metadata}")
+
+
+async def test_multi_url_crawl():
+ """Test hooks with multiple URLs"""
+ print("\n" + "=" * 70)
+ print("Test 5: Multi-URL Crawl (Docker Client)")
+ print("=" * 70)
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nCrawling multiple URLs...")
+
+ hooks = {
+ "before_goto": url_specific_hook,
+ "after_goto": track_progress_hook
+ }
+
+ results = await client.crawl(
+ [
+ "https://httpbin.org/html",
+ "https://httpbin.org/json",
+ "https://httpbin.org/xml"
+ ],
+ hooks=hooks,
+ hooks_timeout=15
+ )
+
+ print("\n✅ Multi-URL crawl completed")
+ print(f"\n Crawled {len(results)} URLs:")
+ for i, result in enumerate(results, 1):
+ status = "✅" if result.success else "❌"
+ print(f" {status} {i}. {result.url}")
+
+
+async def test_reusable_hook_library():
+ """Test using reusable hook library"""
+ print("\n" + "=" * 70)
+ print("Test 6: Reusable Hook Library (Docker Client)")
+ print("=" * 70)
+
+ # Create a library of reusable hooks
+ class HookLibrary:
+ @staticmethod
+ async def block_images(page, context, **kwargs):
+ """Block all images"""
+ await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
+ print("[LIBRARY] Images blocked")
+ return page
+
+ @staticmethod
+ async def block_analytics(page, context, **kwargs):
+ """Block analytics"""
+ await context.route("**/analytics/*", lambda r: r.abort())
+ await context.route("**/google-analytics.com/*", lambda r: r.abort())
+ print("[LIBRARY] Analytics blocked")
+ return page
+
+ @staticmethod
+ async def scroll_infinite(page, context, **kwargs):
+ """Handle infinite scroll"""
+ for i in range(5):
+ prev = await page.evaluate("document.body.scrollHeight")
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+ await page.wait_for_timeout(1000)
+ curr = await page.evaluate("document.body.scrollHeight")
+ if curr == prev:
+ break
+ print("[LIBRARY] Infinite scroll complete")
+ return page
+
+ async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
+ print("\nUsing hook library...")
+
+ hooks = {
+ "on_page_context_created": HookLibrary.block_images,
+ "before_retrieve_html": HookLibrary.scroll_infinite
+ }
+
+ result = await client.crawl(
+ ["https://www.kidocode.com/"],
+ hooks=hooks,
+ hooks_timeout=20
+ )
+
+ print("\n✅ Library hooks completed")
+ print(f" Success: {result.success}")
+
+
+# ============================================================================
+# Main
+# ============================================================================
+
+async def main():
+ """Run all Docker client hook examples"""
+ print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
+ print("Using Python function objects with automatic conversion")
+ print("=" * 70)
+
+ tests = [
+ ("All Hooks Demo", test_all_hooks_comprehensive),
+ ("Authentication", test_authentication_workflow),
+ ("Performance", test_performance_optimization),
+ ("Extraction", test_content_extraction),
+ ("Multi-URL", test_multi_url_crawl),
+ ("Hook Library", test_reusable_hook_library)
+ ]
+
+ for i, (name, test_func) in enumerate(tests, 1):
+ try:
+ await test_func()
+ print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
+ except Exception as e:
+ print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
+ import traceback
+ traceback.print_exc()
+
+ print("=" * 70)
+ print("🎉 All Docker client hook examples completed!")
+ print("\n💡 Key Benefits of Function-Based Hooks:")
+ print(" • Write as regular Python functions")
+ print(" • Full IDE support (autocomplete, types)")
+ print(" • Automatic conversion to API format")
+ print(" • Reusable across projects")
+ print(" • Clean, readable code")
+ print(" • Easy to test and debug")
+ print("=" * 70)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
index ea3692b2..36bf28e1 100644
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -6,18 +6,6 @@
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
-- [Dockerfile Parameters](#dockerfile-parameters)
-- [Using the API](#using-the-api)
- - [Playground Interface](#playground-interface)
- - [Python SDK](#python-sdk)
- - [Understanding Request Schema](#understanding-request-schema)
- - [REST API Examples](#rest-api-examples)
-- [Additional API Endpoints](#additional-api-endpoints)
- - [HTML Extraction Endpoint](#html-extraction-endpoint)
- - [Screenshot Endpoint](#screenshot-endpoint)
- - [PDF Export Endpoint](#pdf-export-endpoint)
- - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
- - [Library Context Endpoint](#library-context-endpoint)
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
- [What is MCP?](#what-is-mcp)
- [Connecting via MCP](#connecting-via-mcp)
@@ -25,9 +13,28 @@
- [Available MCP Tools](#available-mcp-tools)
- [Testing MCP Connections](#testing-mcp-connections)
- [MCP Schemas](#mcp-schemas)
+- [Additional API Endpoints](#additional-api-endpoints)
+ - [HTML Extraction Endpoint](#html-extraction-endpoint)
+ - [Screenshot Endpoint](#screenshot-endpoint)
+ - [PDF Export Endpoint](#pdf-export-endpoint)
+ - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
+- [User-Provided Hooks API](#user-provided-hooks-api)
+ - [Hook Information Endpoint](#hook-information-endpoint)
+ - [Available Hook Points](#available-hook-points)
+ - [Using Hooks in Requests](#using-hooks-in-requests)
+ - [Hook Examples with Real URLs](#hook-examples-with-real-urls)
+ - [Security Best Practices](#security-best-practices)
+ - [Hook Response Information](#hook-response-information)
+ - [Error Handling](#error-handling)
+ - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+ - [Playground Interface](#playground-interface)
+ - [Python SDK](#python-sdk)
+ - [Understanding Request Schema](#understanding-request-schema)
+ - [REST API Examples](#rest-api-examples)
+ - [LLM Configuration Examples](#llm-configuration-examples)
- [Metrics & Monitoring](#metrics--monitoring)
-- [Deployment Scenarios](#deployment-scenarios)
-- [Complete Examples](#complete-examples)
- [Server Configuration](#server-configuration)
- [Understanding config.yml](#understanding-configyml)
- [JWT Authentication](#jwt-authentication)
@@ -832,6 +839,275 @@ else:
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
+### Hooks Utility: Function-Based Approach (Python)
+
+For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
+
+#### Why Use Function-Based Hooks?
+
+**String-Based Approach (shown above)**:
+```python
+hooks_code = {
+ "on_page_context_created": """
+async def hook(page, context, **kwargs):
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+ return page
+"""
+}
+```
+
+**Function-Based Approach (recommended for Python)**:
+```python
+from crawl4ai import Crawl4aiDockerClient
+
+async def my_hook(page, context, **kwargs):
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+ return page
+
+async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
+ result = await client.crawl(
+ ["https://example.com"],
+ hooks={"on_page_context_created": my_hook}
+ )
+```
+
+**Benefits**:
+- ✅ Write hooks as regular Python functions
+- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
+- ✅ Easy to test and debug
+- ✅ Reusable hook libraries
+- ✅ Automatic conversion to API format
+
+#### Using the Hooks Utility
+
+The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
+
+```python
+from crawl4ai import hooks_to_string
+
+# Define your hooks as functions
+async def setup_hook(page, context, **kwargs):
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+ await context.add_cookies([{
+ "name": "session",
+ "value": "token",
+ "domain": ".example.com"
+ }])
+ return page
+
+async def scroll_hook(page, context, **kwargs):
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ return page
+
+# Convert to string format
+hooks_dict = {
+ "on_page_context_created": setup_hook,
+ "before_retrieve_html": scroll_hook
+}
+hooks_string = hooks_to_string(hooks_dict)
+
+# Now use with REST API or Docker client
+# hooks_string contains the string representations
+```
+
+#### Docker Client with Automatic Conversion
+
+The Docker client automatically detects and converts function objects:
+
+```python
+from crawl4ai import Crawl4aiDockerClient
+
+async def auth_hook(page, context, **kwargs):
+ """Add authentication cookies"""
+ await context.add_cookies([{
+ "name": "auth_token",
+ "value": "your_token",
+ "domain": ".example.com"
+ }])
+ return page
+
+async def performance_hook(page, context, **kwargs):
+ """Block unnecessary resources"""
+ await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
+ await context.route("**/analytics/*", lambda r: r.abort())
+ return page
+
+async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
+ # Pass functions directly - automatic conversion!
+ result = await client.crawl(
+ ["https://example.com"],
+ hooks={
+ "on_page_context_created": performance_hook,
+ "before_goto": auth_hook
+ },
+ hooks_timeout=30 # Optional timeout in seconds (1-120)
+ )
+
+ print(f"Success: {result.success}")
+ print(f"HTML: {len(result.html)} chars")
+```
+
+#### Creating Reusable Hook Libraries
+
+Build collections of reusable hooks:
+
+```python
+# hooks_library.py
+class CrawlHooks:
+ """Reusable hook collection for common crawling tasks"""
+
+ @staticmethod
+ async def block_images(page, context, **kwargs):
+ """Block all images to speed up crawling"""
+ await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
+ return page
+
+ @staticmethod
+ async def block_analytics(page, context, **kwargs):
+ """Block analytics and tracking scripts"""
+ tracking_domains = [
+ "**/google-analytics.com/*",
+ "**/googletagmanager.com/*",
+ "**/facebook.com/tr/*",
+ "**/doubleclick.net/*"
+ ]
+ for domain in tracking_domains:
+ await context.route(domain, lambda r: r.abort())
+ return page
+
+ @staticmethod
+ async def scroll_infinite(page, context, **kwargs):
+ """Handle infinite scroll to load more content"""
+ previous_height = 0
+ for i in range(5): # Max 5 scrolls
+ current_height = await page.evaluate("document.body.scrollHeight")
+ if current_height == previous_height:
+ break
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ await page.wait_for_timeout(1000)
+ previous_height = current_height
+ return page
+
+ @staticmethod
+ async def wait_for_dynamic_content(page, context, url, response, **kwargs):
+ """Wait for dynamic content to load"""
+ await page.wait_for_timeout(2000)
+ try:
+ # Click "Load More" if present
+ load_more = await page.query_selector('[class*="load-more"]')
+ if load_more:
+ await load_more.click()
+ await page.wait_for_timeout(1000)
+        except Exception:
+ pass
+ return page
+
+# Use in your application
+from hooks_library import CrawlHooks
+from crawl4ai import Crawl4aiDockerClient
+
+async def crawl_with_optimizations(url):
+ async with Crawl4aiDockerClient() as client:
+ result = await client.crawl(
+ [url],
+ hooks={
+ "on_page_context_created": CrawlHooks.block_images,
+ "before_retrieve_html": CrawlHooks.scroll_infinite
+ }
+ )
+ return result
+```
+
+#### Choosing the Right Approach
+
+| Approach | Best For | IDE Support | Language |
+|----------|----------|-------------|----------|
+| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
+| **Function-based** | Python applications, local development | ✅ Full | Python only |
+| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
+
+**Recommendation**:
+- **Python applications**: Use Docker client with function objects (easiest)
+- **Non-Python or REST API**: Use string-based hooks (most flexible)
+- **Manual control**: Use `hooks_to_string()` utility (middle ground)
+
+#### Complete Example with Function Hooks
+
+```python
+from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
+
+# Define hooks as regular Python functions
+async def setup_environment(page, context, **kwargs):
+ """Setup crawling environment"""
+ # Set viewport
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+
+ # Block resources for speed
+ await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
+
+ # Add custom headers
+ await page.set_extra_http_headers({
+ "Accept-Language": "en-US",
+ "X-Custom-Header": "Crawl4AI"
+ })
+
+ print("[HOOK] Environment configured")
+ return page
+
+async def extract_content(page, context, **kwargs):
+ """Extract and prepare content"""
+ # Scroll to load lazy content
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ await page.wait_for_timeout(1000)
+
+ # Extract metadata
+ metadata = await page.evaluate('''() => ({
+ title: document.title,
+ links: document.links.length,
+ images: document.images.length
+ })''')
+
+ print(f"[HOOK] Page metadata: {metadata}")
+ return page
+
+async def main():
+ async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
+ # Configure crawl
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ # Crawl with hooks
+ result = await client.crawl(
+ ["https://httpbin.org/html"],
+ browser_config=browser_config,
+ crawler_config=crawler_config,
+ hooks={
+ "on_page_context_created": setup_environment,
+ "before_retrieve_html": extract_content
+ },
+ hooks_timeout=30
+ )
+
+ if result.success:
+ print(f"✅ Crawl successful!")
+ print(f" URL: {result.url}")
+ print(f" HTML: {len(result.html)} chars")
+ print(f" Markdown: {len(result.markdown)} chars")
+ else:
+ print(f"❌ Crawl failed: {result.error_message}")
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
+```
+
+#### Additional Resources
+
+- **Comprehensive Examples**: See `/docs/examples/docker_client_hooks_example.py` for Python function-based examples
+- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
+- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
+- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
+
---
## Dockerfile Parameters
@@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when
Install the SDK: `pip install crawl4ai`
+The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
+
```python
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
-from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
+from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
# Point to the correct server port
@@ -907,23 +1185,22 @@ async def main():
print("--- Running Non-Streaming Crawl ---")
results = await client.crawl(
["https://httpbin.org/html"],
- browser_config=BrowserConfig(headless=True), # Use library classes for config aid
+ browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
- if results: # client.crawl returns None on failure
- print(f"Non-streaming results success: {results.success}")
- if results.success:
- for result in results: # Iterate through the CrawlResultContainer
- print(f"URL: {result.url}, Success: {result.success}")
+ if results:
+ print(f"Non-streaming results success: {results.success}")
+ if results.success:
+ for result in results:
+ print(f"URL: {result.url}, Success: {result.success}")
else:
print("Non-streaming crawl failed.")
-
# Example Streaming crawl
print("\n--- Running Streaming Crawl ---")
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
try:
- async for result in await client.crawl( # client.crawl returns an async generator for streaming
+ async for result in await client.crawl(
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
browser_config=BrowserConfig(headless=True),
crawler_config=stream_config
@@ -932,17 +1209,56 @@ async def main():
except Exception as e:
print(f"Streaming crawl failed: {e}")
+ # Example with hooks (Python function objects)
+ print("\n--- Crawl with Hooks ---")
+
+ async def my_hook(page, context, **kwargs):
+ """Custom hook to optimize performance"""
+ await page.set_viewport_size({"width": 1920, "height": 1080})
+ await context.route("**/*.{png,jpg}", lambda r: r.abort())
+ print("[HOOK] Page optimized")
+ return page
+
+ result = await client.crawl(
+ ["https://httpbin.org/html"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ hooks={"on_page_context_created": my_hook}, # Pass function directly!
+ hooks_timeout=30
+ )
+ print(f"Crawl with hooks success: {result.success}")
# Example Get schema
print("\n--- Getting Schema ---")
schema = await client.get_schema()
- print(f"Schema received: {bool(schema)}") # Print whether schema was received
+ print(f"Schema received: {bool(schema)}")
if __name__ == "__main__":
asyncio.run(main())
```
-*(SDK parameters like timeout, verify_ssl etc. remain the same)*
+#### SDK Parameters
+
+The Docker client supports the following parameters:
+
+**Client Initialization**:
+- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`)
+- `timeout` (float): Request timeout in seconds (default: 30.0)
+- `verify_ssl` (bool): Verify SSL certificates (default: True)
+- `verbose` (bool): Enable verbose logging (default: True)
+- `log_file` (Optional[str]): Path to log file (default: None)
+
+**crawl() Method**:
+- `urls` (List[str]): List of URLs to crawl
+- `browser_config` (Optional[BrowserConfig]): Browser configuration
+- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
+- `hooks` (Optional[Union[Dict[str, Callable], Dict[str, str]]]): Hook functions or strings - **automatically converts function objects!**
+- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30)
+
+**Returns**:
+- Single URL: `CrawlResult` object
+- Multiple URLs: `List[CrawlResult]`
+- Streaming: `AsyncGenerator[CrawlResult, None]`
### Second Approach: Direct API Calls
@@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
- Building and running the Docker container
-- Configuring the environment
+- Configuring the environment
- Using the interactive playground for testing
- Making API requests with proper typing
-- Using the Python SDK
+- Using the Python SDK with **automatic hook conversion**
+- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
- Connecting via the Model Context Protocol (MCP)
- Monitoring your deployment
-The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
+### Key Features
-For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
+- **String-based** (REST API): Works with any language, requires manual string formatting
+- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion
-Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests.
+
+**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+
+### Next Steps
+
+1. **Explore Examples**: Check out the comprehensive examples in:
+ - `/docs/examples/hooks_docker_client_example.py` - Python function-based hooks
+ - `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
+ - `/docs/examples/README_HOOKS.md` - Comparison and guide
+
+2. **Read Documentation**:
+ - `/docs/hooks-utility-guide.md` - Complete hooks utility guide
+ - API documentation for detailed configuration options
+
+3. **Join the Community**:
+ - GitHub: Report issues and contribute
+ - Discord: Get help and share your experiences
+ - Documentation: Comprehensive guides and tutorials
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
diff --git a/docs/md_v2/marketplace/admin/admin.js b/docs/md_v2/marketplace/admin/admin.js
index 7d31a826..d43dd822 100644
--- a/docs/md_v2/marketplace/admin/admin.js
+++ b/docs/md_v2/marketplace/admin/admin.js
@@ -30,10 +30,10 @@ const { API_BASE, API_ORIGIN } = (() => {
if (origin) {
const normalized = cleanOrigin(origin);
- return { API_BASE: `${normalized}/api`, API_ORIGIN: normalized };
+ return { API_BASE: `${normalized}/marketplace/api`, API_ORIGIN: normalized };
}
- return { API_BASE: '/api', API_ORIGIN: '' };
+ return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
})();
const resolveAssetUrl = (path) => {
diff --git a/docs/md_v2/marketplace/admin/index.html b/docs/md_v2/marketplace/admin/index.html
index a9b649f2..0d302384 100644
--- a/docs/md_v2/marketplace/admin/index.html
+++ b/docs/md_v2/marketplace/admin/index.html
@@ -210,6 +210,6 @@
-
+