feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377
Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py)
This commit is contained in:
522
docs/examples/docker_client_hooks_example.py
Normal file
522
docs/examples/docker_client_hooks_example.py
Normal file
@@ -0,0 +1,522 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive hooks examples using Docker Client with function objects.
|
||||
|
||||
This approach is recommended because:
|
||||
- Write hooks as regular Python functions
|
||||
- Full IDE support (autocomplete, type checking)
|
||||
- Automatic conversion to API format
|
||||
- Reusable and testable code
|
||||
- Clean, readable syntax
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import Crawl4aiDockerClient
|
||||
|
||||
# API_BASE_URL = "http://localhost:11235"
|
||||
API_BASE_URL = "http://localhost:11234"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Hook Function Definitions
|
||||
# ============================================================================
|
||||
|
||||
# --- All Hooks Demo ---
|
||||
async def browser_created_hook(browser, **kwargs):
|
||||
"""Called after browser is created"""
|
||||
print("[HOOK] Browser created and ready")
|
||||
return browser
|
||||
|
||||
|
||||
async def page_context_hook(page, context, **kwargs):
|
||||
"""Setup page environment"""
|
||||
print("[HOOK] Setting up page environment")
|
||||
|
||||
# Set viewport
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
|
||||
# Add cookies
|
||||
await context.add_cookies([{
|
||||
"name": "test_session",
|
||||
"value": "abc123xyz",
|
||||
"domain": ".httpbin.org",
|
||||
"path": "/"
|
||||
}])
|
||||
|
||||
# Block resources
|
||||
await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
|
||||
await context.route("**/analytics/*", lambda route: route.abort())
|
||||
|
||||
print("[HOOK] Environment configured")
|
||||
return page
|
||||
|
||||
|
||||
async def user_agent_hook(page, context, user_agent, **kwargs):
|
||||
"""Called when user agent is updated"""
|
||||
print(f"[HOOK] User agent: {user_agent[:50]}...")
|
||||
return page
|
||||
|
||||
|
||||
async def before_goto_hook(page, context, url, **kwargs):
|
||||
"""Called before navigating to URL"""
|
||||
print(f"[HOOK] Navigating to: {url}")
|
||||
|
||||
await page.set_extra_http_headers({
|
||||
"X-Custom-Header": "crawl4ai-test",
|
||||
"Accept-Language": "en-US"
|
||||
})
|
||||
|
||||
return page
|
||||
|
||||
|
||||
async def after_goto_hook(page, context, url, response, **kwargs):
|
||||
"""Called after page loads"""
|
||||
print(f"[HOOK] Page loaded: {url}")
|
||||
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
try:
|
||||
await page.wait_for_selector("body", timeout=2000)
|
||||
print("[HOOK] Body element ready")
|
||||
except:
|
||||
print("[HOOK] Timeout, continuing")
|
||||
|
||||
return page
|
||||
|
||||
|
||||
async def execution_started_hook(page, context, **kwargs):
|
||||
"""Called when custom JS execution starts"""
|
||||
print("[HOOK] JS execution started")
|
||||
await page.evaluate("console.log('[HOOK] Custom JS');")
|
||||
return page
|
||||
|
||||
|
||||
async def before_retrieve_hook(page, context, **kwargs):
|
||||
"""Called before retrieving HTML"""
|
||||
print("[HOOK] Preparing HTML retrieval")
|
||||
|
||||
# Scroll for lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||
await page.wait_for_timeout(500)
|
||||
await page.evaluate("window.scrollTo(0, 0);")
|
||||
|
||||
print("[HOOK] Scrolling complete")
|
||||
return page
|
||||
|
||||
|
||||
async def before_return_hook(page, context, html, **kwargs):
|
||||
"""Called before returning HTML"""
|
||||
print(f"[HOOK] HTML ready: {len(html)} chars")
|
||||
|
||||
metrics = await page.evaluate('''() => ({
|
||||
images: document.images.length,
|
||||
links: document.links.length,
|
||||
scripts: document.scripts.length
|
||||
})''')
|
||||
|
||||
print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
|
||||
return page
|
||||
|
||||
|
||||
# --- Authentication Hooks ---
|
||||
async def auth_context_hook(page, context, **kwargs):
|
||||
"""Setup authentication context"""
|
||||
print("[HOOK] Setting up authentication")
|
||||
|
||||
# Add auth cookies
|
||||
await context.add_cookies([{
|
||||
"name": "auth_token",
|
||||
"value": "fake_jwt_token",
|
||||
"domain": ".httpbin.org",
|
||||
"path": "/",
|
||||
"httpOnly": True
|
||||
}])
|
||||
|
||||
# Set localStorage
|
||||
await page.evaluate('''
|
||||
localStorage.setItem('user_id', '12345');
|
||||
localStorage.setItem('auth_time', new Date().toISOString());
|
||||
''')
|
||||
|
||||
print("[HOOK] Auth context ready")
|
||||
return page
|
||||
|
||||
|
||||
async def auth_headers_hook(page, context, url, **kwargs):
|
||||
"""Add authentication headers"""
|
||||
print(f"[HOOK] Adding auth headers for {url}")
|
||||
|
||||
import base64
|
||||
credentials = base64.b64encode(b"user:passwd").decode('ascii')
|
||||
|
||||
await page.set_extra_http_headers({
|
||||
'Authorization': f'Basic {credentials}',
|
||||
'X-API-Key': 'test-key-123'
|
||||
})
|
||||
|
||||
return page
|
||||
|
||||
|
||||
# --- Performance Optimization Hooks ---
|
||||
async def performance_hook(page, context, **kwargs):
|
||||
"""Optimize page for performance"""
|
||||
print("[HOOK] Optimizing for performance")
|
||||
|
||||
# Block resource-heavy content
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort())
|
||||
await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort())
|
||||
await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort())
|
||||
await context.route("**/googletagmanager.com/*", lambda r: r.abort())
|
||||
await context.route("**/google-analytics.com/*", lambda r: r.abort())
|
||||
await context.route("**/facebook.com/*", lambda r: r.abort())
|
||||
|
||||
# Disable animations
|
||||
await page.add_style_tag(content='''
|
||||
*, *::before, *::after {
|
||||
animation-duration: 0s !important;
|
||||
transition-duration: 0s !important;
|
||||
}
|
||||
''')
|
||||
|
||||
print("[HOOK] Optimizations applied")
|
||||
return page
|
||||
|
||||
|
||||
async def cleanup_hook(page, context, **kwargs):
|
||||
"""Clean page before extraction"""
|
||||
print("[HOOK] Cleaning page")
|
||||
|
||||
await page.evaluate('''() => {
|
||||
const selectors = [
|
||||
'.ad', '.ads', '.advertisement',
|
||||
'.popup', '.modal', '.overlay',
|
||||
'.cookie-banner', '.newsletter'
|
||||
];
|
||||
|
||||
selectors.forEach(sel => {
|
||||
document.querySelectorAll(sel).forEach(el => el.remove());
|
||||
});
|
||||
|
||||
document.querySelectorAll('script, style').forEach(el => el.remove());
|
||||
}''')
|
||||
|
||||
print("[HOOK] Page cleaned")
|
||||
return page
|
||||
|
||||
|
||||
# --- Content Extraction Hooks ---
|
||||
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
|
||||
"""Wait for dynamic content to load"""
|
||||
print(f"[HOOK] Waiting for dynamic content on {url}")
|
||||
|
||||
await page.wait_for_timeout(2000)
|
||||
|
||||
# Click "Load More" if exists
|
||||
try:
|
||||
load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
|
||||
if load_more:
|
||||
await load_more.click()
|
||||
await page.wait_for_timeout(1000)
|
||||
print("[HOOK] Clicked 'Load More'")
|
||||
except:
|
||||
pass
|
||||
|
||||
return page
|
||||
|
||||
|
||||
async def extract_metadata_hook(page, context, **kwargs):
|
||||
"""Extract page metadata"""
|
||||
print("[HOOK] Extracting metadata")
|
||||
|
||||
metadata = await page.evaluate('''() => {
|
||||
const getMeta = (name) => {
|
||||
const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
|
||||
return el ? el.getAttribute('content') : null;
|
||||
};
|
||||
|
||||
return {
|
||||
title: document.title,
|
||||
description: getMeta('description'),
|
||||
author: getMeta('author'),
|
||||
keywords: getMeta('keywords'),
|
||||
};
|
||||
}''')
|
||||
|
||||
print(f"[HOOK] Metadata: {metadata}")
|
||||
|
||||
# Infinite scroll
|
||||
for i in range(3):
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||
await page.wait_for_timeout(1000)
|
||||
print(f"[HOOK] Scroll {i+1}/3")
|
||||
|
||||
return page
|
||||
|
||||
|
||||
# --- Multi-URL Hooks ---
|
||||
async def url_specific_hook(page, context, url, **kwargs):
|
||||
"""Apply URL-specific logic"""
|
||||
print(f"[HOOK] Processing URL: {url}")
|
||||
|
||||
# URL-specific headers
|
||||
if 'html' in url:
|
||||
await page.set_extra_http_headers({"X-Type": "HTML"})
|
||||
elif 'json' in url:
|
||||
await page.set_extra_http_headers({"X-Type": "JSON"})
|
||||
|
||||
return page
|
||||
|
||||
|
||||
async def track_progress_hook(page, context, url, response, **kwargs):
|
||||
"""Track crawl progress"""
|
||||
status = response.status if response else 'unknown'
|
||||
print(f"[HOOK] Loaded {url} - Status: {status}")
|
||||
return page
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Test Functions
|
||||
# ============================================================================
|
||||
|
||||
async def test_all_hooks_comprehensive():
|
||||
"""Test all 8 hook types"""
|
||||
print("=" * 70)
|
||||
print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nCrawling with all 8 hooks...")
|
||||
|
||||
# Define hooks with function objects
|
||||
hooks = {
|
||||
"on_browser_created": browser_created_hook,
|
||||
"on_page_context_created": page_context_hook,
|
||||
"on_user_agent_updated": user_agent_hook,
|
||||
"before_goto": before_goto_hook,
|
||||
"after_goto": after_goto_hook,
|
||||
"on_execution_started": execution_started_hook,
|
||||
"before_retrieve_html": before_retrieve_hook,
|
||||
"before_return_html": before_return_hook
|
||||
}
|
||||
|
||||
result = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
hooks=hooks,
|
||||
hooks_timeout=30
|
||||
)
|
||||
|
||||
print("\n✅ Success!")
|
||||
print(f" URL: {result.url}")
|
||||
print(f" Success: {result.success}")
|
||||
print(f" HTML: {len(result.html)} chars")
|
||||
|
||||
|
||||
async def test_authentication_workflow():
|
||||
"""Test authentication with hooks"""
|
||||
print("\n" + "=" * 70)
|
||||
print("Test 2: Authentication Workflow (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nTesting authentication...")
|
||||
|
||||
hooks = {
|
||||
"on_page_context_created": auth_context_hook,
|
||||
"before_goto": auth_headers_hook
|
||||
}
|
||||
|
||||
result = await client.crawl(
|
||||
["https://httpbin.org/basic-auth/user/passwd"],
|
||||
hooks=hooks,
|
||||
hooks_timeout=15
|
||||
)
|
||||
|
||||
print("\n✅ Authentication completed")
|
||||
|
||||
if result.success:
|
||||
if '"authenticated"' in result.html and 'true' in result.html:
|
||||
print(" ✅ Basic auth successful!")
|
||||
else:
|
||||
print(" ⚠️ Auth status unclear")
|
||||
else:
|
||||
print(f" ❌ Failed: {result.error_message}")
|
||||
|
||||
|
||||
async def test_performance_optimization():
|
||||
"""Test performance optimization"""
|
||||
print("\n" + "=" * 70)
|
||||
print("Test 3: Performance Optimization (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nTesting performance hooks...")
|
||||
|
||||
hooks = {
|
||||
"on_page_context_created": performance_hook,
|
||||
"before_retrieve_html": cleanup_hook
|
||||
}
|
||||
|
||||
result = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
hooks=hooks,
|
||||
hooks_timeout=10
|
||||
)
|
||||
|
||||
print("\n✅ Optimization completed")
|
||||
print(f" HTML size: {len(result.html):,} chars")
|
||||
print(" Resources blocked, ads removed")
|
||||
|
||||
|
||||
async def test_content_extraction():
|
||||
"""Test content extraction"""
|
||||
print("\n" + "=" * 70)
|
||||
print("Test 4: Content Extraction (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nTesting extraction hooks...")
|
||||
|
||||
hooks = {
|
||||
"after_goto": wait_dynamic_content_hook,
|
||||
"before_retrieve_html": extract_metadata_hook
|
||||
}
|
||||
|
||||
result = await client.crawl(
|
||||
["https://www.kidocode.com/"],
|
||||
hooks=hooks,
|
||||
hooks_timeout=20
|
||||
)
|
||||
|
||||
print("\n✅ Extraction completed")
|
||||
print(f" URL: {result.url}")
|
||||
print(f" Success: {result.success}")
|
||||
print(f" Metadata: {result.metadata}")
|
||||
|
||||
|
||||
async def test_multi_url_crawl():
|
||||
"""Test hooks with multiple URLs"""
|
||||
print("\n" + "=" * 70)
|
||||
print("Test 5: Multi-URL Crawl (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nCrawling multiple URLs...")
|
||||
|
||||
hooks = {
|
||||
"before_goto": url_specific_hook,
|
||||
"after_goto": track_progress_hook
|
||||
}
|
||||
|
||||
results = await client.crawl(
|
||||
[
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/json",
|
||||
"https://httpbin.org/xml"
|
||||
],
|
||||
hooks=hooks,
|
||||
hooks_timeout=15
|
||||
)
|
||||
|
||||
print("\n✅ Multi-URL crawl completed")
|
||||
print(f"\n Crawled {len(results)} URLs:")
|
||||
for i, result in enumerate(results, 1):
|
||||
status = "✅" if result.success else "❌"
|
||||
print(f" {status} {i}. {result.url}")
|
||||
|
||||
|
||||
async def test_reusable_hook_library():
|
||||
"""Test using reusable hook library"""
|
||||
print("\n" + "=" * 70)
|
||||
print("Test 6: Reusable Hook Library (Docker Client)")
|
||||
print("=" * 70)
|
||||
|
||||
# Create a library of reusable hooks
|
||||
class HookLibrary:
|
||||
@staticmethod
|
||||
async def block_images(page, context, **kwargs):
|
||||
"""Block all images"""
|
||||
await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort())
|
||||
print("[LIBRARY] Images blocked")
|
||||
return page
|
||||
|
||||
@staticmethod
|
||||
async def block_analytics(page, context, **kwargs):
|
||||
"""Block analytics"""
|
||||
await context.route("**/analytics/*", lambda r: r.abort())
|
||||
await context.route("**/google-analytics.com/*", lambda r: r.abort())
|
||||
print("[LIBRARY] Analytics blocked")
|
||||
return page
|
||||
|
||||
@staticmethod
|
||||
async def scroll_infinite(page, context, **kwargs):
|
||||
"""Handle infinite scroll"""
|
||||
for i in range(5):
|
||||
prev = await page.evaluate("document.body.scrollHeight")
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||
await page.wait_for_timeout(1000)
|
||||
curr = await page.evaluate("document.body.scrollHeight")
|
||||
if curr == prev:
|
||||
break
|
||||
print("[LIBRARY] Infinite scroll complete")
|
||||
return page
|
||||
|
||||
async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
|
||||
print("\nUsing hook library...")
|
||||
|
||||
hooks = {
|
||||
"on_page_context_created": HookLibrary.block_images,
|
||||
"before_retrieve_html": HookLibrary.scroll_infinite
|
||||
}
|
||||
|
||||
result = await client.crawl(
|
||||
["https://www.kidocode.com/"],
|
||||
hooks=hooks,
|
||||
hooks_timeout=20
|
||||
)
|
||||
|
||||
print("\n✅ Library hooks completed")
|
||||
print(f" Success: {result.success}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
async def main():
|
||||
"""Run all Docker client hook examples"""
|
||||
print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
|
||||
print("Using Python function objects with automatic conversion")
|
||||
print("=" * 70)
|
||||
|
||||
tests = [
|
||||
("All Hooks Demo", test_all_hooks_comprehensive),
|
||||
("Authentication", test_authentication_workflow),
|
||||
("Performance", test_performance_optimization),
|
||||
("Extraction", test_content_extraction),
|
||||
("Multi-URL", test_multi_url_crawl),
|
||||
("Hook Library", test_reusable_hook_library)
|
||||
]
|
||||
|
||||
for i, (name, test_func) in enumerate(tests, 1):
|
||||
try:
|
||||
await test_func()
|
||||
print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n")
|
||||
except Exception as e:
|
||||
print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
print("=" * 70)
|
||||
print("🎉 All Docker client hook examples completed!")
|
||||
print("\n💡 Key Benefits of Function-Based Hooks:")
|
||||
print(" • Write as regular Python functions")
|
||||
print(" • Full IDE support (autocomplete, types)")
|
||||
print(" • Automatic conversion to API format")
|
||||
print(" • Reusable across projects")
|
||||
print(" • Clean, readable code")
|
||||
print(" • Easy to test and debug")
|
||||
print("=" * 70)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -6,18 +6,6 @@
|
||||
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
||||
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
||||
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||
- [Using the API](#using-the-api)
|
||||
- [Playground Interface](#playground-interface)
|
||||
- [Python SDK](#python-sdk)
|
||||
- [Understanding Request Schema](#understanding-request-schema)
|
||||
- [REST API Examples](#rest-api-examples)
|
||||
- [Additional API Endpoints](#additional-api-endpoints)
|
||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||
- [Library Context Endpoint](#library-context-endpoint)
|
||||
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
||||
- [What is MCP?](#what-is-mcp)
|
||||
- [Connecting via MCP](#connecting-via-mcp)
|
||||
@@ -25,9 +13,28 @@
|
||||
- [Available MCP Tools](#available-mcp-tools)
|
||||
- [Testing MCP Connections](#testing-mcp-connections)
|
||||
- [MCP Schemas](#mcp-schemas)
|
||||
- [Additional API Endpoints](#additional-api-endpoints)
|
||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||
- [User-Provided Hooks API](#user-provided-hooks-api)
|
||||
- [Hook Information Endpoint](#hook-information-endpoint)
|
||||
- [Available Hook Points](#available-hook-points)
|
||||
- [Using Hooks in Requests](#using-hooks-in-requests)
|
||||
- [Hook Examples with Real URLs](#hook-examples-with-real-urls)
|
||||
- [Security Best Practices](#security-best-practices)
|
||||
- [Hook Response Information](#hook-response-information)
|
||||
- [Error Handling](#error-handling)
|
||||
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||
- [Using the API](#using-the-api)
|
||||
- [Playground Interface](#playground-interface)
|
||||
- [Python SDK](#python-sdk)
|
||||
- [Understanding Request Schema](#understanding-request-schema)
|
||||
- [REST API Examples](#rest-api-examples)
|
||||
- [LLM Configuration Examples](#llm-configuration-examples)
|
||||
- [Metrics & Monitoring](#metrics--monitoring)
|
||||
- [Deployment Scenarios](#deployment-scenarios)
|
||||
- [Complete Examples](#complete-examples)
|
||||
- [Server Configuration](#server-configuration)
|
||||
- [Understanding config.yml](#understanding-configyml)
|
||||
- [JWT Authentication](#jwt-authentication)
|
||||
@@ -832,6 +839,275 @@ else:
|
||||
|
||||
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
||||
|
||||
### Hooks Utility: Function-Based Approach (Python)
|
||||
|
||||
For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
|
||||
|
||||
#### Why Use Function-Based Hooks?
|
||||
|
||||
**String-Based Approach (shown above)**:
|
||||
```python
|
||||
hooks_code = {
|
||||
"on_page_context_created": """
|
||||
async def hook(page, context, **kwargs):
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
return page
|
||||
"""
|
||||
}
|
||||
```
|
||||
|
||||
**Function-Based Approach (recommended for Python)**:
|
||||
```python
|
||||
from crawl4ai import Crawl4aiDockerClient
|
||||
|
||||
async def my_hook(page, context, **kwargs):
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
return page
|
||||
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||
result = await client.crawl(
|
||||
["https://example.com"],
|
||||
hooks={"on_page_context_created": my_hook}
|
||||
)
|
||||
```
|
||||
|
||||
**Benefits**:
|
||||
- ✅ Write hooks as regular Python functions
|
||||
- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
|
||||
- ✅ Easy to test and debug
|
||||
- ✅ Reusable hook libraries
|
||||
- ✅ Automatic conversion to API format
|
||||
|
||||
#### Using the Hooks Utility
|
||||
|
||||
The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
|
||||
|
||||
```python
|
||||
from crawl4ai import hooks_to_string
|
||||
|
||||
# Define your hooks as functions
|
||||
async def setup_hook(page, context, **kwargs):
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
await context.add_cookies([{
|
||||
"name": "session",
|
||||
"value": "token",
|
||||
"domain": ".example.com"
|
||||
}])
|
||||
return page
|
||||
|
||||
async def scroll_hook(page, context, **kwargs):
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
return page
|
||||
|
||||
# Convert to string format
|
||||
hooks_dict = {
|
||||
"on_page_context_created": setup_hook,
|
||||
"before_retrieve_html": scroll_hook
|
||||
}
|
||||
hooks_string = hooks_to_string(hooks_dict)
|
||||
|
||||
# Now use with REST API or Docker client
|
||||
# hooks_string contains the string representations
|
||||
```
|
||||
|
||||
#### Docker Client with Automatic Conversion
|
||||
|
||||
The Docker client automatically detects and converts function objects:
|
||||
|
||||
```python
|
||||
from crawl4ai import Crawl4aiDockerClient
|
||||
|
||||
async def auth_hook(page, context, **kwargs):
|
||||
"""Add authentication cookies"""
|
||||
await context.add_cookies([{
|
||||
"name": "auth_token",
|
||||
"value": "your_token",
|
||||
"domain": ".example.com"
|
||||
}])
|
||||
return page
|
||||
|
||||
async def performance_hook(page, context, **kwargs):
|
||||
"""Block unnecessary resources"""
|
||||
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||
await context.route("**/analytics/*", lambda r: r.abort())
|
||||
return page
|
||||
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||
# Pass functions directly - automatic conversion!
|
||||
result = await client.crawl(
|
||||
["https://example.com"],
|
||||
hooks={
|
||||
"on_page_context_created": performance_hook,
|
||||
"before_goto": auth_hook
|
||||
},
|
||||
hooks_timeout=30 # Optional timeout in seconds (1-120)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"HTML: {len(result.html)} chars")
|
||||
```
|
||||
|
||||
#### Creating Reusable Hook Libraries
|
||||
|
||||
Build collections of reusable hooks:
|
||||
|
||||
```python
|
||||
# hooks_library.py
|
||||
class CrawlHooks:
|
||||
"""Reusable hook collection for common crawling tasks"""
|
||||
|
||||
@staticmethod
|
||||
async def block_images(page, context, **kwargs):
|
||||
"""Block all images to speed up crawling"""
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
|
||||
return page
|
||||
|
||||
@staticmethod
|
||||
async def block_analytics(page, context, **kwargs):
|
||||
"""Block analytics and tracking scripts"""
|
||||
tracking_domains = [
|
||||
"**/google-analytics.com/*",
|
||||
"**/googletagmanager.com/*",
|
||||
"**/facebook.com/tr/*",
|
||||
"**/doubleclick.net/*"
|
||||
]
|
||||
for domain in tracking_domains:
|
||||
await context.route(domain, lambda r: r.abort())
|
||||
return page
|
||||
|
||||
@staticmethod
|
||||
async def scroll_infinite(page, context, **kwargs):
|
||||
"""Handle infinite scroll to load more content"""
|
||||
previous_height = 0
|
||||
for i in range(5): # Max 5 scrolls
|
||||
current_height = await page.evaluate("document.body.scrollHeight")
|
||||
if current_height == previous_height:
|
||||
break
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
previous_height = current_height
|
||||
return page
|
||||
|
||||
@staticmethod
|
||||
async def wait_for_dynamic_content(page, context, url, response, **kwargs):
|
||||
"""Wait for dynamic content to load"""
|
||||
await page.wait_for_timeout(2000)
|
||||
try:
|
||||
# Click "Load More" if present
|
||||
load_more = await page.query_selector('[class*="load-more"]')
|
||||
if load_more:
|
||||
await load_more.click()
|
||||
await page.wait_for_timeout(1000)
|
||||
except:
|
||||
pass
|
||||
return page
|
||||
|
||||
# Use in your application
|
||||
from hooks_library import CrawlHooks
|
||||
from crawl4ai import Crawl4aiDockerClient
|
||||
|
||||
async def crawl_with_optimizations(url):
|
||||
async with Crawl4aiDockerClient() as client:
|
||||
result = await client.crawl(
|
||||
[url],
|
||||
hooks={
|
||||
"on_page_context_created": CrawlHooks.block_images,
|
||||
"before_retrieve_html": CrawlHooks.scroll_infinite
|
||||
}
|
||||
)
|
||||
return result
|
||||
```
|
||||
|
||||
#### Choosing the Right Approach
|
||||
|
||||
| Approach | Best For | IDE Support | Language |
|
||||
|----------|----------|-------------|----------|
|
||||
| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
|
||||
| **Function-based** | Python applications, local development | ✅ Full | Python only |
|
||||
| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
|
||||
|
||||
**Recommendation**:
|
||||
- **Python applications**: Use Docker client with function objects (easiest)
|
||||
- **Non-Python or REST API**: Use string-based hooks (most flexible)
|
||||
- **Manual control**: Use `hooks_to_string()` utility (middle ground)
|
||||
|
||||
#### Complete Example with Function Hooks
|
||||
|
||||
```python
|
||||
from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
# Define hooks as regular Python functions
|
||||
async def setup_environment(page, context, **kwargs):
|
||||
"""Setup crawling environment"""
|
||||
# Set viewport
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
|
||||
# Block resources for speed
|
||||
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||
|
||||
# Add custom headers
|
||||
await page.set_extra_http_headers({
|
||||
"Accept-Language": "en-US",
|
||||
"X-Custom-Header": "Crawl4AI"
|
||||
})
|
||||
|
||||
print("[HOOK] Environment configured")
|
||||
return page
|
||||
|
||||
async def extract_content(page, context, **kwargs):
|
||||
"""Extract and prepare content"""
|
||||
# Scroll to load lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
|
||||
# Extract metadata
|
||||
metadata = await page.evaluate('''() => ({
|
||||
title: document.title,
|
||||
links: document.links.length,
|
||||
images: document.images.length
|
||||
})''')
|
||||
|
||||
print(f"[HOOK] Page metadata: {metadata}")
|
||||
return page
|
||||
|
||||
async def main():
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||
# Configure crawl
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
# Crawl with hooks
|
||||
result = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
browser_config=browser_config,
|
||||
crawler_config=crawler_config,
|
||||
hooks={
|
||||
"on_page_context_created": setup_environment,
|
||||
"before_retrieve_html": extract_content
|
||||
},
|
||||
hooks_timeout=30
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✅ Crawl successful!")
|
||||
print(f" URL: {result.url}")
|
||||
print(f" HTML: {len(result.html)} chars")
|
||||
print(f" Markdown: {len(result.markdown)} chars")
|
||||
else:
|
||||
print(f"❌ Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
#### Additional Resources
|
||||
|
||||
- **Comprehensive Examples**: See `/docs/examples/hooks_docker_client_example.py` for Python function-based examples
|
||||
- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
|
||||
- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
|
||||
- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
|
||||
|
||||
---
|
||||
|
||||
## Dockerfile Parameters
|
||||
@@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when
|
||||
|
||||
Install the SDK: `pip install crawl4ai`
|
||||
|
||||
The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
# Point to the correct server port
|
||||
@@ -907,23 +1185,22 @@ async def main():
|
||||
print("--- Running Non-Streaming Crawl ---")
|
||||
results = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
if results: # client.crawl returns None on failure
|
||||
print(f"Non-streaming results success: {results.success}")
|
||||
if results.success:
|
||||
for result in results: # Iterate through the CrawlResultContainer
|
||||
print(f"URL: {result.url}, Success: {result.success}")
|
||||
if results:
|
||||
print(f"Non-streaming results success: {results.success}")
|
||||
if results.success:
|
||||
for result in results:
|
||||
print(f"URL: {result.url}, Success: {result.success}")
|
||||
else:
|
||||
print("Non-streaming crawl failed.")
|
||||
|
||||
|
||||
# Example Streaming crawl
|
||||
print("\n--- Running Streaming Crawl ---")
|
||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||
try:
|
||||
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
||||
async for result in await client.crawl(
|
||||
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=stream_config
|
||||
@@ -932,17 +1209,56 @@ async def main():
|
||||
except Exception as e:
|
||||
print(f"Streaming crawl failed: {e}")
|
||||
|
||||
# Example with hooks (Python function objects)
|
||||
print("\n--- Crawl with Hooks ---")
|
||||
|
||||
async def my_hook(page, context, **kwargs):
|
||||
"""Custom hook to optimize performance"""
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
await context.route("**/*.{png,jpg}", lambda r: r.abort())
|
||||
print("[HOOK] Page optimized")
|
||||
return page
|
||||
|
||||
result = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||
hooks={"on_page_context_created": my_hook}, # Pass function directly!
|
||||
hooks_timeout=30
|
||||
)
|
||||
print(f"Crawl with hooks success: {result.success}")
|
||||
|
||||
# Example Get schema
|
||||
print("\n--- Getting Schema ---")
|
||||
schema = await client.get_schema()
|
||||
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
||||
print(f"Schema received: {bool(schema)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
||||
#### SDK Parameters
|
||||
|
||||
The Docker client supports the following parameters:
|
||||
|
||||
**Client Initialization**:
|
||||
- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`)
|
||||
- `timeout` (float): Request timeout in seconds (default: 30.0)
|
||||
- `verify_ssl` (bool): Verify SSL certificates (default: True)
|
||||
- `verbose` (bool): Enable verbose logging (default: True)
|
||||
- `log_file` (Optional[str]): Path to log file (default: None)
|
||||
|
||||
**crawl() Method**:
|
||||
- `urls` (List[str]): List of URLs to crawl
|
||||
- `browser_config` (Optional[BrowserConfig]): Browser configuration
|
||||
- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
|
||||
- `hooks` (Optional[Dict]): Hook functions or strings - **automatically converts function objects!**
|
||||
- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30)
|
||||
|
||||
**Returns**:
|
||||
- Single URL: `CrawlResult` object
|
||||
- Multiple URLs: `List[CrawlResult]`
|
||||
- Streaming: `AsyncGenerator[CrawlResult]`
|
||||
|
||||
### Second Approach: Direct API Calls
|
||||
|
||||
@@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
||||
|
||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||
- Building and running the Docker container
|
||||
- Configuring the environment
|
||||
- Configuring the environment
|
||||
- Using the interactive playground for testing
|
||||
- Making API requests with proper typing
|
||||
- Using the Python SDK
|
||||
- Using the Python SDK with **automatic hook conversion**
|
||||
- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
|
||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||
- Connecting via the Model Context Protocol (MCP)
|
||||
- Monitoring your deployment
|
||||
|
||||
The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
|
||||
### Key Features
|
||||
|
||||
For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
||||
**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
|
||||
- **String-based** (REST API): Works with any language, requires manual string formatting
|
||||
- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion
|
||||
|
||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
||||
**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests.
|
||||
|
||||
**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
||||
|
||||
### Next Steps
|
||||
|
||||
1. **Explore Examples**: Check out the comprehensive examples in:
|
||||
- `/docs/examples/hooks_docker_client_example.py` - Python function-based hooks
|
||||
- `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
|
||||
- `/docs/examples/README_HOOKS.md` - Comparison and guide
|
||||
|
||||
2. **Read Documentation**:
|
||||
- `/docs/hooks-utility-guide.md` - Complete hooks utility guide
|
||||
- API documentation for detailed configuration options
|
||||
|
||||
3. **Join the Community**:
|
||||
- GitHub: Report issues and contribute
|
||||
- Discord: Get help and share your experiences
|
||||
- Documentation: Comprehensive guides and tutorials
|
||||
|
||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||
|
||||
|
||||
Reference in New Issue
Block a user