From aadab30c3dc8f5d92aa754ff2dc03ce0d9260621 Mon Sep 17 00:00:00 2001 From: ntohidi Date: Mon, 13 Oct 2025 13:08:47 +0800 Subject: [PATCH] fix(docs): clarify Docker Hooks System with function-based API in README --- README.md | 56 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 58d4bf4c..ef0002e1 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data [✨ Check out latest update v0.7.5](#-recent-updates) -✨ New in v0.7.5: Docker Hooks System for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) +✨ New in v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) ✨ Recent v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md) @@ -179,7 +179,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g - 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis. - 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`). - 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content. -- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior. 
+- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string-based and function-based hook APIs). - 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches. - 📄 **Metadata Extraction**: Retrieve structured metadata from web pages. - 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content. @@ -549,34 +549,40 @@ async def test_news_crawl():
Version 0.7.5 Release Highlights - The Docker Hooks & Security Update -- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions: +- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points +- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support: ```python - import requests + from crawl4ai import hooks_to_string + from crawl4ai.docker_client import Crawl4aiDockerClient - # Real working hooks for httpbin.org - hooks_config = { - "on_page_context_created": """ - async def hook(page, context, **kwargs): - print("Hook: Setting up page context") - # Block images to speed up crawling + # Define hooks as regular Python functions + async def on_page_context_created(page, context, **kwargs): + """Block images to speed up crawling""" await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) return page - """, - "before_goto": """ - async def hook(page, context, url, **kwargs): - print(f"Hook: About to navigate to {url}") - # Add custom headers - await page.set_extra_http_headers({'X-Test-Header': 'crawl4ai-hooks-test'}) - return page - """ - } - # Test with Docker API - payload = { - "urls": ["https://httpbin.org/html"], - "hooks": {"code": hooks_config, "timeout": 30} - } - response = requests.post("http://localhost:11235/crawl", json=payload) + async def before_goto(page, context, url, **kwargs): + """Add custom headers""" + await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'}) + return page + + # Option 1: Use hooks_to_string() utility for REST API + hooks_code = hooks_to_string({ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto + }) + + # Option 2: Docker client with automatic conversion (Recommended) + client = Crawl4aiDockerClient(base_url="http://localhost:11235") + results = await client.crawl( + 
urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto + } + ) + # ✓ Full IDE support, type checking, and reusability! ``` - **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration