From 4a04b8506a81d28afa2f8039554a00f1e6bf588b Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 13 Oct 2025 12:53:33 +0800 Subject: [PATCH] feat: Add hooks utility for function-based hooks with Docker client integration. ref #1377 Add hooks_to_string() utility function that converts Python function objects to string representations for the Docker API, enabling developers to write hooks as regular Python functions instead of strings. Core Changes: - New hooks_to_string() utility in crawl4ai/utils.py using inspect.getsource() - Docker client now accepts both function objects and strings for hooks - Automatic detection and conversion in Crawl4aiDockerClient._prepare_request() - New hooks and hooks_timeout parameters in client.crawl() method Documentation: - Docker client examples with function-based hooks (docs/examples/docker_client_hooks_example.py) - Updated main Docker deployment guide with comprehensive hooks section - Added unit tests for hooks utility (tests/docker/test_hooks_utility.py) --- docs/blog/release-v0.7.5.md | 82 +++++++++++- docs/md_v2/blog/releases/v0.7.5.md | 82 +++++++++++- docs/releases_review/demo_v0.7.5.py | 185 ++++++++++++++++------------ 3 files changed, 269 insertions(+), 80 deletions(-) diff --git a/docs/blog/release-v0.7.5.md b/docs/blog/release-v0.7.5.md index 5740873f..977d2fd9 100644 --- a/docs/blog/release-v0.7.5.md +++ b/docs/blog/release-v0.7.5.md @@ -8,7 +8,8 @@ Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. Thi ## 🎯 What's New at a Glance -- **Docker Hooks System**: Custom Python functions at key pipeline points +- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API +- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion - **Enhanced LLM Integration**: Custom providers with temperature control - **HTTPS Preservation**: Secure internal link handling - **Bug Fixes**: Resolved multiple community-reported issues @@ -82,6 +83,85 @@ if result.get('success'): - `before_retrieve_html`: Pre-extraction processing - `before_return_html`: Final HTML processing +### Function-Based Hooks API + +Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion! + +**Option 1: Using the `hooks_to_string()` Utility** + +```python +from crawl4ai import hooks_to_string +import requests + +# Define hooks as regular Python functions (with full IDE support!) +async def on_page_context_created(page, context, **kwargs): + """Block images to speed up crawling""" + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + +async def before_goto(page, context, url, **kwargs): + """Add custom headers""" + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'v0.7.5', + 'X-Custom-Header': 'my-value' + }) + return page + +# Convert functions to strings +hooks_code = hooks_to_string({ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto +}) + +# Use with REST API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": {"code": hooks_code, "timeout": 30} +} +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +**Option 2: Docker Client with Automatic Conversion (Recommended!)** + +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Define hooks as functions (same as above) +async def on_page_context_created(page, context, **kwargs): + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + return page + +async def before_retrieve_html(page, context, **kwargs): + # Scroll to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + return page + +# Use Docker client - conversion happens automatically! +client = Crawl4aiDockerClient(base_url="http://localhost:11235") + +results = await client.crawl( + urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created, + "before_retrieve_html": before_retrieve_html + }, + hooks_timeout=30 +) + +if results and results.success: + print(f"✅ Hooks executed! HTML length: {len(results.html)}") +``` + +**Benefits of Function-Based Hooks:** +- ✅ Full IDE support (autocomplete, syntax highlighting) +- ✅ Type checking and linting +- ✅ Easier to test and debug +- ✅ Reusable across projects +- ✅ Automatic conversion in Docker client +- ✅ No breaking changes - string hooks still work! + ## 🤖 Enhanced LLM Integration Enhanced LLM integration with custom providers, temperature control, and base URL configuration. diff --git a/docs/md_v2/blog/releases/v0.7.5.md b/docs/md_v2/blog/releases/v0.7.5.md index 5740873f..977d2fd9 100644 --- a/docs/md_v2/blog/releases/v0.7.5.md +++ b/docs/md_v2/blog/releases/v0.7.5.md @@ -8,7 +8,8 @@ Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. Thi ## 🎯 What's New at a Glance -- **Docker Hooks System**: Custom Python functions at key pipeline points +- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API +- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion - **Enhanced LLM Integration**: Custom providers with temperature control - **HTTPS Preservation**: Secure internal link handling - **Bug Fixes**: Resolved multiple community-reported issues @@ -82,6 +83,85 @@ if result.get('success'): - `before_retrieve_html`: Pre-extraction processing - `before_return_html`: Final HTML processing +### Function-Based Hooks API + +Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion! + +**Option 1: Using the `hooks_to_string()` Utility** + +```python +from crawl4ai import hooks_to_string +import requests + +# Define hooks as regular Python functions (with full IDE support!) +async def on_page_context_created(page, context, **kwargs): + """Block images to speed up crawling""" + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + +async def before_goto(page, context, url, **kwargs): + """Add custom headers""" + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'v0.7.5', + 'X-Custom-Header': 'my-value' + }) + return page + +# Convert functions to strings +hooks_code = hooks_to_string({ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto +}) + +# Use with REST API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": {"code": hooks_code, "timeout": 30} +} +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +**Option 2: Docker Client with Automatic Conversion (Recommended!)** + +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Define hooks as functions (same as above) +async def on_page_context_created(page, context, **kwargs): + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + return page + +async def before_retrieve_html(page, context, **kwargs): + # Scroll to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + return page + +# Use Docker client - conversion happens automatically! +client = Crawl4aiDockerClient(base_url="http://localhost:11235") + +results = await client.crawl( + urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created, + "before_retrieve_html": before_retrieve_html + }, + hooks_timeout=30 +) + +if results and results.success: + print(f"✅ Hooks executed! HTML length: {len(results.html)}") +``` + +**Benefits of Function-Based Hooks:** +- ✅ Full IDE support (autocomplete, syntax highlighting) +- ✅ Type checking and linting +- ✅ Easier to test and debug +- ✅ Reusable across projects +- ✅ Automatic conversion in Docker client +- ✅ No breaking changes - string hooks still work! + ## 🤖 Enhanced LLM Integration Enhanced LLM integration with custom providers, temperature control, and base URL configuration. diff --git a/docs/releases_review/demo_v0.7.5.py b/docs/releases_review/demo_v0.7.5.py index d25778ee..bda472ab 100644 --- a/docs/releases_review/demo_v0.7.5.py +++ b/docs/releases_review/demo_v0.7.5.py @@ -4,7 +4,7 @@ This demo showcases key features introduced in v0.7.5 with real, executable examples. Featured Demos: -1. ✅ Docker Hooks System - Real API calls with custom hooks +1. ✅ Docker Hooks System - Real API calls with custom hooks (string & function-based) 2. ✅ Enhanced LLM Integration - Working LLM configurations 3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance @@ -19,8 +19,10 @@ import requests import time import sys -from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, - CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy) +from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, + CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy, + hooks_to_string) +from crawl4ai.docker_client import Crawl4aiDockerClient def print_section(title: str, description: str = ""): @@ -36,13 +38,13 @@ async def demo_1_docker_hooks_system(): """Demo 1: Docker Hooks System - Real API calls with custom hooks""" print_section( "Demo 1: Docker Hooks System", - "Testing real Docker hooks with live API calls" + "Testing both string-based and function-based hooks (NEW in v0.7.5!)" ) # Check Docker service availability def check_docker_service(): try: - response = requests.get("http://localhost:11234/", timeout=3) + response = requests.get("http://localhost:11235/", timeout=3) return response.status_code == 200 except: return False @@ -60,108 +62,132 @@ async def demo_1_docker_hooks_system(): print("✓ Docker service detected!") - # Define real working hooks - hooks_config = { + # ============================================================================ + # PART 1: Traditional String-Based Hooks (Works with REST API) + # ============================================================================ + print("\n" + "─" * 60) + print("Part 1: String-Based Hooks (REST API)") + print("─" * 60) + + hooks_config_string = { "on_page_context_created": """ async def hook(page, context, **kwargs): - print("Hook: Setting up page context") - # Block images to speed up crawling + print("[String Hook] Setting up page context") await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) - print("Hook: Images blocked") return page """, - "before_retrieve_html": """ async def hook(page, context, **kwargs): - print("Hook: Before retrieving HTML") - # Scroll to bottom to load lazy content + print("[String Hook] Before retrieving HTML") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) - print("Hook: Scrolled to bottom") - return page -""", - - "before_goto": """ -async def hook(page, context, url, **kwargs): - print(f"Hook: About to navigate to {url}") - # Add custom headers - await page.set_extra_http_headers({ - 'X-Test-Header': 'crawl4ai-hooks-test' - }) return page """ } - # Test with a reliable URL - test_url = "https://httpbin.org/html" - payload = { "urls": ["https://httpbin.org/html"], "hooks": { - "code": hooks_config, + "code": hooks_config_string, "timeout": 30 } } - print(f"🎯 Testing URL: {test_url}") - print("🔧 Configured 3 hooks: on_page_context_created, before_retrieve_html, before_goto\n") - - # Make the request - print("🔄 Executing hooks...") - + print("🔧 Using string-based hooks for REST API...") try: start_time = time.time() - response = requests.post( - "http://localhost:11234/crawl", - json=payload, - timeout=60 - ) + response = requests.post("http://localhost:11235/crawl", json=payload, timeout=60) execution_time = time.time() - start_time if response.status_code == 200: result = response.json() - - print(f"🎉 Success! Execution time: {execution_time:.2f}s\n") - - # Display results - success = result.get('success', False) - print(f"✅ Crawl Status: {'Success' if success else 'Failed'}") - - if success: - markdown_content = result.get('markdown', '') - print(f"📄 Content Length: {len(markdown_content)} characters") - - # Show content preview - if markdown_content: - preview = markdown_content[:300] + "..." if len(markdown_content) > 300 else markdown_content - print("\n--- Content Preview ---") - print(preview) - print("--- End Preview ---\n") - - # Check if our hook marker is present - raw_html = result.get('html', '') - if "Crawl4AI v0.7.5 Docker Hook" in raw_html: - print("✓ Hook marker found in HTML - hooks executed successfully!") - - # Display hook execution info if available - print("\nHook Execution Summary:") - print("🔗 before_goto: URL modified with tracking parameter") - print("✅ after_goto: Page navigation completed") - print("📝 before_return_html: Content processed and marked") - + print(f"✅ String-based hooks executed in {execution_time:.2f}s") + if result.get('results') and result['results'][0].get('success'): + html_length = len(result['results'][0].get('html', '')) + print(f" 📄 HTML length: {html_length} characters") else: print(f"❌ Request failed: {response.status_code}") - try: - error_data = response.json() - print(f"Error: {error_data}") - except: - print(f"Raw response: {response.text[:500]}") - - except requests.exceptions.Timeout: - print("⏰ Request timed out after 60 seconds") except Exception as e: print(f"❌ Error: {str(e)}") + # ============================================================================ + # PART 2: NEW Function-Based Hooks with Docker Client (v0.7.5) + # ============================================================================ + print("\n" + "─" * 60) + print("Part 2: Function-Based Hooks with Docker Client (✨ NEW!)") + print("─" * 60) + + # Define hooks as regular Python functions + async def on_page_context_created_func(page, context, **kwargs): + """Block images to speed up crawling""" + print("[Function Hook] Setting up page context") + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def before_goto_func(page, context, url, **kwargs): + """Add custom headers before navigation""" + print(f"[Function Hook] About to navigate to {url}") + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'v0.7.5-function-hooks', + 'X-Test-Header': 'demo' + }) + return page + + async def before_retrieve_html_func(page, context, **kwargs): + """Scroll to load lazy content""" + print("[Function Hook] Scrolling page for lazy-loaded content") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(500) + await page.evaluate("window.scrollTo(0, 0)") + return page + + # Use the hooks_to_string utility (can be used standalone) + print("\n📦 Converting functions to strings with hooks_to_string()...") + hooks_as_strings = hooks_to_string({ + "on_page_context_created": on_page_context_created_func, + "before_goto": before_goto_func, + "before_retrieve_html": before_retrieve_html_func + }) + print(f" ✓ Converted {len(hooks_as_strings)} hooks to string format") + + # OR use Docker Client which does conversion automatically! + print("\n🐳 Using Docker Client with automatic conversion...") + try: + client = Crawl4aiDockerClient(base_url="http://localhost:11235") + + # Pass function objects directly - conversion happens automatically! + results = await client.crawl( + urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created_func, + "before_goto": before_goto_func, + "before_retrieve_html": before_retrieve_html_func + }, + hooks_timeout=30 + ) + + if results and results.success: + print(f"✅ Function-based hooks executed successfully!") + print(f" 📄 HTML length: {len(results.html)} characters") + print(f" 🎯 URL: {results.url}") + else: + print("⚠️ Crawl completed but may have warnings") + + except Exception as e: + print(f"❌ Docker client error: {str(e)}") + + # Show the benefits + print("\n" + "=" * 60) + print("✨ Benefits of Function-Based Hooks:") + print("=" * 60) + print("✓ Full IDE support (autocomplete, syntax highlighting)") + print("✓ Type checking and linting") + print("✓ Easier to test and debug") + print("✓ Reusable across projects") + print("✓ Automatic conversion in Docker client") + print("=" * 60) + async def demo_2_enhanced_llm_integration(): """Demo 2: Enhanced LLM Integration - Working LLM configurations""" @@ -182,7 +208,7 @@ async def demo_2_enhanced_llm_integration(): } try: response = requests.post( - "http://localhost:11234/md", + "http://localhost:11235/md", json=payload, timeout=60 ) @@ -285,7 +311,10 @@ async def main(): print("You've experienced the power of Crawl4AI v0.7.5!") print("") print("Key Features Demonstrated:") - print("🔧 Docker Hooks - Custom pipeline modifications") + print("🔧 Docker Hooks - String-based & function-based (NEW!)") + print(" • hooks_to_string() utility for function conversion") + print(" • Docker client with automatic conversion") + print(" • Full IDE support and type checking") print("🤖 Enhanced LLM - Better AI integration") print("🔒 HTTPS Preservation - Secure link handling") print("")