diff --git a/.gitignore b/.gitignore index aa7cb616..c4313701 100644 --- a/.gitignore +++ b/.gitignore @@ -267,6 +267,8 @@ continue_config.json .llm.env .private/ +.claude/ + CLAUDE_MONITOR.md CLAUDE.md diff --git a/Dockerfile b/Dockerfile index 1267578c..33c33d55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM python:3.12-slim-bookworm AS build # C4ai version -ARG C4AI_VER=0.7.0-r1 +ARG C4AI_VER=0.7.6 ENV C4AI_VERSION=$C4AI_VER LABEL c4ai.version=$C4AI_VER diff --git a/README.md b/README.md index 45f11560..d9a68482 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,13 @@ Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community. -[✨ Check out latest update v0.7.4](#-recent-updates) +[✨ Check out latest update v0.7.6](#-recent-updates) -✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md) +✨ **New in v0.7.6**: Complete Webhook Infrastructure for Docker Job Queue API! Real-time notifications for both `/crawl/job` and `/llm/job` endpoints with exponential backoff retry, custom headers, and flexible delivery modes. No more polling! [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.6.md) -✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md) +✨ Recent v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) + +✨ Previous v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
🤓 My Personal Story @@ -177,7 +179,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g - 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis. - 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`). - 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content. -- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior. +- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string and function-based APIs). - 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches. - 📄 **Metadata Extraction**: Retrieve structured metadata from web pages. - 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content. @@ -544,6 +546,54 @@ async def test_news_crawl(): ## ✨ Recent Updates +
+Version 0.7.5 Release Highlights - The Docker Hooks & Security Update + +- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points +- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support: + ```python + from crawl4ai import hooks_to_string + from crawl4ai.docker_client import Crawl4aiDockerClient + + # Define hooks as regular Python functions + async def on_page_context_created(page, context, **kwargs): + """Block images to speed up crawling""" + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + + async def before_goto(page, context, url, **kwargs): + """Add custom headers""" + await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'}) + return page + + # Option 1: Use hooks_to_string() utility for REST API + hooks_code = hooks_to_string({ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto + }) + + # Option 2: Docker client with automatic conversion (Recommended) + client = Crawl4aiDockerClient(base_url="http://localhost:11235") + results = await client.crawl( + urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto + } + ) + # ✓ Full IDE support, type checking, and reusability! + ``` + +- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration +- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True` +- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance +- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration + +[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md) + +
+
Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update @@ -919,6 +969,36 @@ We envision a future where AI is powered by real human knowledge, ensuring data For more details, see our [full mission statement](./MISSION.md).
+## 🌟 Current Sponsors
+
+### 🏢 Enterprise Sponsors & Partners
+
+Our enterprise sponsors and technology partners help scale Crawl4AI to power production-grade data pipelines.
+
+| Company | About | Sponsorship Tier |
+|------|------|----------------------------|
+| Capsolver | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more. | 🥈 Silver |
+| DataSync | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives. | 🥇 Gold |
+| Kidocode | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
+| Aleph Null | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education, empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
+
+### 🧑‍🤝‍🧑 Individual Sponsors
+
+A heartfelt thanks to our individual supporters! Every contribution helps us keep our open-source mission alive and thriving!
+ +> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode) + ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e..8f1fdef4 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -103,7 +103,8 @@ from .browser_adapter import ( from .utils import ( start_colab_display_server, - setup_colab_environment + setup_colab_environment, + hooks_to_string ) __all__ = [ @@ -183,6 +184,7 @@ __all__ = [ "ProxyConfig", "start_colab_display_server", "setup_colab_environment", + "hooks_to_string", # C4A Script additions "c4a_compile", "c4a_validate", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index b73a591d..36be3827 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,7 +1,7 @@ # crawl4ai/__version__.py # This is the version that will be used for stable releases -__version__ = "0.7.4" +__version__ = "0.7.6" # For nightly builds, this gets set during build process __nightly_version__ = None diff --git a/crawl4ai/docker_client.py b/crawl4ai/docker_client.py index 4e33431f..969fee7c 100644 --- a/crawl4ai/docker_client.py +++ b/crawl4ai/docker_client.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, AsyncGenerator, Dict, Any +from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable import httpx import json from urllib.parse import urljoin @@ -7,6 +7,7 @@ import asyncio from .async_configs import BrowserConfig, CrawlerRunConfig from .models import CrawlResult from .async_logger import AsyncLogger, LogLevel +from .utils import hooks_to_string class Crawl4aiClientError(Exception): @@ -70,17 +71,41 @@ class Crawl4aiDockerClient: self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR") raise ConnectionError(f"Cannot connect to server: {str(e)}") - def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]: + def _prepare_request( + self, + urls: List[str], + browser_config: Optional[BrowserConfig] = None, + crawler_config: Optional[CrawlerRunConfig] = None, + hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None, + hooks_timeout: int = 30 + ) -> Dict[str, Any]: """Prepare request data from configs.""" if self._token: self._http_client.headers["Authorization"] = f"Bearer {self._token}" - return { + + request_data = { "urls": urls, "browser_config": browser_config.dump() if browser_config else {}, "crawler_config": crawler_config.dump() if crawler_config else {} } + # Handle hooks if provided + if hooks: + # Check if hooks are already strings or need conversion + if any(callable(v) for v in hooks.values()): + # Convert function objects to strings + hooks_code = hooks_to_string(hooks) + else: + # Already in string format + hooks_code = hooks + + request_data["hooks"] = { + "code": hooks_code, + "timeout": hooks_timeout + } + + return request_data + async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response: """Make an HTTP request with error handling.""" url = urljoin(self.base_url, endpoint) @@ -102,16 +127,42 @@ class Crawl4aiDockerClient: self, urls: List[str], browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None + crawler_config: Optional[CrawlerRunConfig] = None, + hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None, + 
hooks_timeout: int = 30 ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]: - """Execute a crawl operation.""" + """ + Execute a crawl operation. + + Args: + urls: List of URLs to crawl + browser_config: Browser configuration + crawler_config: Crawler configuration + hooks: Optional hooks - can be either: + - Dict[str, Callable]: Function objects that will be converted to strings + - Dict[str, str]: Already stringified hook code + hooks_timeout: Timeout in seconds for each hook execution (1-120) + + Returns: + Single CrawlResult, list of results, or async generator for streaming + + Example with function hooks: + >>> async def my_hook(page, context, **kwargs): + ... await page.set_viewport_size({"width": 1920, "height": 1080}) + ... return page + >>> + >>> result = await client.crawl( + ... ["https://example.com"], + ... hooks={"on_page_context_created": my_hook} + ... ) + """ await self._check_server() - - data = self._prepare_request(urls, browser_config, crawler_config) + + data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout) is_streaming = crawler_config and crawler_config.stream - + self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL") - + if is_streaming: async def stream_results() -> AsyncGenerator[CrawlResult, None]: async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response: @@ -128,12 +179,12 @@ class Crawl4aiDockerClient: else: yield CrawlResult(**result) return stream_results() - + response = await self._request("POST", "/crawl", json=data) result_data = response.json() if not result_data.get("success", False): raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}") - + results = [CrawlResult(**r) for r in result_data.get("results", [])] self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL") return results[0] if len(results) == 1 else results diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 046351e7..bbd7ffa2 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -47,6 +47,7 @@ from urllib.parse import ( urljoin, urlparse, urlunparse, parse_qsl, urlencode, quote, unquote ) +import inspect # Monkey patch to fix wildcard handling in urllib.robotparser @@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]: available_gb = get_true_available_memory_gb() used_percent = get_true_memory_usage_percent() - return used_percent, available_gb, total_gb \ No newline at end of file + return used_percent, available_gb, total_gb + + +# Hook utilities for Docker API +def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]: + """ + Convert hook function objects to string representations for Docker API. + + This utility simplifies the process of using hooks with the Docker API by converting + Python function objects into the string format required by the API. + + Args: + hooks: Dictionary mapping hook point names to Python function objects. + Functions should be async and follow hook signature requirements. + + Returns: + Dictionary mapping hook point names to string representations of the functions. + + Example: + >>> async def my_hook(page, context, **kwargs): + ... await page.set_viewport_size({"width": 1920, "height": 1080}) + ... 
return page + >>> + >>> hooks_dict = {"on_page_context_created": my_hook} + >>> api_hooks = hooks_to_string(hooks_dict) + >>> # api_hooks is now ready to use with Docker API + + Raises: + ValueError: If a hook is not callable or source cannot be extracted + """ + result = {} + + for hook_name, hook_func in hooks.items(): + if not callable(hook_func): + raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}") + + try: + # Get the source code of the function + source = inspect.getsource(hook_func) + # Remove any leading indentation to get clean source + source = textwrap.dedent(source) + result[hook_name] = source + except (OSError, TypeError) as e: + raise ValueError( + f"Cannot extract source code for hook '{hook_name}'. " + f"Make sure the function is defined in a file (not interactively). Error: {e}" + ) + + return result diff --git a/deploy/docker/README.md b/deploy/docker/README.md index d35050cc..cee8af7f 100644 --- a/deploy/docker/README.md +++ b/deploy/docker/README.md @@ -12,6 +12,7 @@ - [Python SDK](#python-sdk) - [Understanding Request Schema](#understanding-request-schema) - [REST API Examples](#rest-api-examples) + - [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks) - [Additional API Endpoints](#additional-api-endpoints) - [HTML Extraction Endpoint](#html-extraction-endpoint) - [Screenshot Endpoint](#screenshot-endpoint) @@ -58,15 +59,13 @@ Pull and run images directly from Docker Hub without building locally. #### 1. Pull the Image -Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system. - -> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features. +Our latest stable release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system. ```bash -# Pull the release candidate (for testing new features) -docker pull unclecode/crawl4ai:0.7.0-r1 +# Pull the latest stable version (0.7.6) +docker pull unclecode/crawl4ai:0.7.6 -# Or pull the current stable version (0.6.0) +# Or use the latest tag (points to 0.7.6) docker pull unclecode/crawl4ai:latest ``` @@ -101,7 +100,7 @@ EOL -p 11235:11235 \ --name crawl4ai \ --shm-size=1g \ - unclecode/crawl4ai:0.7.0-r1 + unclecode/crawl4ai:0.7.6 ``` * **With LLM support:** @@ -112,7 +111,7 @@ EOL --name crawl4ai \ --env-file .llm.env \ --shm-size=1g \ - unclecode/crawl4ai:0.7.0-r1 + unclecode/crawl4ai:0.7.6 ``` > The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface. @@ -185,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach ```bash # Pulls and runs the release candidate from Docker Hub # Automatically selects the correct architecture - IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d + IMAGE=unclecode/crawl4ai:0.7.6 docker compose up -d ``` * **Build and Run Locally:** @@ -648,6 +647,194 @@ async def test_stream_crawl(token: str = None): # Made token optional # asyncio.run(test_stream_crawl()) ``` +### Asynchronous Jobs with Webhooks + +For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete. 
+ +#### Why Use Jobs & Webhooks? + +- **No Polling Required** - Get notified when crawls complete instead of constantly checking status +- **Better Resource Usage** - Free up client connections while jobs run in the background +- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices +- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s) + +#### How It Works + +1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config` +2. **Get Task ID** → Receive a `task_id` immediately +3. **Job Runs** → Crawl executes in the background +4. **Webhook Fired** → Server POSTs completion notification to your webhook URL +5. **Fetch Results** → If data wasn't included in webhook, GET `/crawl/job/{task_id}` + +#### Quick Example + +```bash +# Submit a crawl job with webhook notification +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": false + } + }' + +# Response: {"task_id": "crawl_a1b2c3d4"} +``` + +**Your webhook receives:** +```json +{ + "task_id": "crawl_a1b2c3d4", + "task_type": "crawl", + "status": "completed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"] +} +``` + +Then fetch the results: +```bash +curl http://localhost:11235/crawl/job/crawl_a1b2c3d4 +``` + +#### Include Data in Webhook + +Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook: + +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": true + } + }' +``` + +**Your webhook receives the complete data:** +```json +{ + "task_id": "crawl_a1b2c3d4", + "task_type": "crawl", + "status": "completed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"], + "data": { + "markdown": "...", + "html": "...", + "links": {...}, + "metadata": {...} + } +} +``` + +#### Webhook Authentication + +Add custom headers for authentication: + +```json +{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl", + "webhook_data_in_payload": false, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token", + "X-Service-ID": "crawl4ai-prod" + } + } +} +``` + +#### Global Default Webhook + +Configure a default webhook URL in `config.yml` for all jobs: + +```yaml +webhooks: + enabled: true + default_url: "https://myapp.com/webhooks/default" + data_in_payload: false + retry: + max_attempts: 5 + initial_delay_ms: 1000 + max_delay_ms: 32000 + timeout_ms: 30000 +``` + +Now jobs without `webhook_config` automatically use the default webhook. + +#### Job Status Polling (Without Webhooks) + +If you prefer polling instead of webhooks, just omit `webhook_config`: + +```bash +# Submit job +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{"urls": ["https://example.com"]}' +# Response: {"task_id": "crawl_xyz"} + +# Poll for status +curl http://localhost:11235/crawl/job/crawl_xyz +``` + +The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`. 
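+
+For reference, here is the same polling flow in Python. This is a minimal sketch built on the endpoints documented above; the `requests` dependency and the 2-second interval are illustrative choices, not requirements:
+
+```python
+import time
+
+import requests
+
+# Submit a job without webhook_config, so we poll for completion
+resp = requests.post(
+    "http://localhost:11235/crawl/job",
+    json={"urls": ["https://example.com"]},
+)
+task_id = resp.json()["task_id"]
+
+# Poll until the task leaves the "processing" state
+while True:
+    task = requests.get(f"http://localhost:11235/crawl/job/{task_id}").json()
+    if task["status"] in ("completed", "failed"):
+        break
+    time.sleep(2)  # illustrative interval; tune for your workload
+
+print(task["status"], task.get("error", ""))
+```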
+ +#### LLM Extraction Jobs with Webhooks + +The same webhook system works for LLM extraction jobs via `/llm/job`: + +```bash +# Submit LLM extraction job with webhook +curl -X POST http://localhost:11235/llm/job \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com/article", + "q": "Extract the article title, author, and main points", + "provider": "openai/gpt-4o-mini", + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/llm-complete", + "webhook_data_in_payload": true, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + } + }' + +# Response: {"task_id": "llm_1234567890"} +``` + +**Your webhook receives:** +```json +{ + "task_id": "llm_1234567890", + "task_type": "llm_extraction", + "status": "completed", + "timestamp": "2025-10-22T12:30:00.000000+00:00", + "urls": ["https://example.com/article"], + "data": { + "extracted_content": { + "title": "Understanding Web Scraping", + "author": "John Doe", + "main_points": ["Point 1", "Point 2", "Point 3"] + } + } +} +``` + +**Key Differences for LLM Jobs:** +- Task type is `"llm_extraction"` instead of `"crawl"` +- Extracted data is in `data.extracted_content` +- Single URL only (not an array) +- Supports schema-based extraction with `schema` parameter + +> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling. + --- ## Metrics & Monitoring @@ -826,10 +1013,11 @@ We're here to help you succeed with Crawl4AI! Here's how to get support: In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: - Building and running the Docker container -- Configuring the environment +- Configuring the environment - Using the interactive playground for testing - Making API requests with proper typing - Using the Python SDK +- Asynchronous job queues with webhook notifications - Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution - Connecting via the Model Context Protocol (MCP) - Monitoring your deployment diff --git a/deploy/docker/WEBHOOK_EXAMPLES.md b/deploy/docker/WEBHOOK_EXAMPLES.md new file mode 100644 index 00000000..190efb18 --- /dev/null +++ b/deploy/docker/WEBHOOK_EXAMPLES.md @@ -0,0 +1,378 @@ +# Webhook Feature Examples + +This document provides examples of how to use the webhook feature for crawl jobs in Crawl4AI. + +## Overview + +The webhook feature allows you to receive notifications when crawl jobs complete, eliminating the need for polling. Webhooks are sent with exponential backoff retry logic to ensure reliable delivery. + +## Configuration + +### Global Configuration (config.yml) + +You can configure default webhook settings in `config.yml`: + +```yaml +webhooks: + enabled: true + default_url: null # Optional: default webhook URL for all jobs + data_in_payload: false # Optional: default behavior for including data + retry: + max_attempts: 5 + initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff + max_delay_ms: 32000 + timeout_ms: 30000 # 30s timeout per webhook call + headers: # Optional: default headers to include + User-Agent: "Crawl4AI-Webhook/1.0" +``` + +## API Usage Examples + +### Example 1: Basic Webhook (Notification Only) + +Send a webhook notification without including the crawl data in the payload. 
+ +**Request:** +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": false + } + }' +``` + +**Response:** +```json +{ + "task_id": "crawl_a1b2c3d4" +} +``` + +**Webhook Payload Received:** +```json +{ + "task_id": "crawl_a1b2c3d4", + "task_type": "crawl", + "status": "completed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"] +} +``` + +Your webhook handler should then fetch the results: +```bash +curl http://localhost:11235/crawl/job/crawl_a1b2c3d4 +``` + +### Example 2: Webhook with Data Included + +Include the full crawl results in the webhook payload. + +**Request:** +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": true + } + }' +``` + +**Webhook Payload Received:** +```json +{ + "task_id": "crawl_a1b2c3d4", + "task_type": "crawl", + "status": "completed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"], + "data": { + "markdown": "...", + "html": "...", + "links": {...}, + "metadata": {...} + } +} +``` + +### Example 3: Webhook with Custom Headers + +Include custom headers for authentication or identification. + +**Request:** +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": false, + "webhook_headers": { + "X-Webhook-Secret": "my-secret-token", + "X-Service-ID": "crawl4ai-production" + } + } + }' +``` + +The webhook will be sent with these additional headers plus the default headers from config. + +### Example 4: Failure Notification + +When a crawl job fails, a webhook is sent with error details. + +**Webhook Payload on Failure:** +```json +{ + "task_id": "crawl_a1b2c3d4", + "task_type": "crawl", + "status": "failed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"], + "error": "Connection timeout after 30s" +} +``` + +### Example 5: Using Global Default Webhook + +If you set a `default_url` in config.yml, jobs without webhook_config will use it: + +**config.yml:** +```yaml +webhooks: + enabled: true + default_url: "https://myapp.com/webhooks/default" + data_in_payload: false +``` + +**Request (no webhook_config needed):** +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"] + }' +``` + +The webhook will be sent to the default URL configured in config.yml. + +### Example 6: LLM Extraction Job with Webhook + +Use webhooks with the LLM extraction endpoint for asynchronous processing. 
+ +**Request:** +```bash +curl -X POST http://localhost:11235/llm/job \ + -H "Content-Type: application/json" \ + -d '{ + "url": "https://example.com/article", + "q": "Extract the article title, author, and publication date", + "schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}", + "cache": false, + "provider": "openai/gpt-4o-mini", + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/llm-complete", + "webhook_data_in_payload": true + } + }' +``` + +**Response:** +```json +{ + "task_id": "llm_1698765432_12345" +} +``` + +**Webhook Payload Received:** +```json +{ + "task_id": "llm_1698765432_12345", + "task_type": "llm_extraction", + "status": "completed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com/article"], + "data": { + "extracted_content": { + "title": "Understanding Web Scraping", + "author": "John Doe", + "date": "2025-10-21" + } + } +} +``` + +## Webhook Handler Example + +Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs: + +```python +from flask import Flask, request, jsonify +import requests + +app = Flask(__name__) + +@app.route('/webhooks/crawl-complete', methods=['POST']) +def handle_crawl_webhook(): + payload = request.json + + task_id = payload['task_id'] + task_type = payload['task_type'] + status = payload['status'] + + if status == 'completed': + # If data not in payload, fetch it + if 'data' not in payload: + # Determine endpoint based on task type + endpoint = 'crawl' if task_type == 'crawl' else 'llm' + response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}') + data = response.json() + else: + data = payload['data'] + + # Process based on task type + if task_type == 'crawl': + print(f"Processing crawl results for {task_id}") + # Handle crawl results + results = data.get('results', []) + for result in results: + print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars") + + elif task_type == 'llm_extraction': + print(f"Processing LLM extraction for {task_id}") + # Handle LLM extraction + # Note: Webhook sends 'extracted_content', API returns 'result' + extracted = data.get('extracted_content', data.get('result', {})) + print(f" - Extracted: {extracted}") + + # Your business logic here... + + elif status == 'failed': + error = payload.get('error', 'Unknown error') + print(f"{task_type} job {task_id} failed: {error}") + # Handle failure... + + return jsonify({"status": "received"}), 200 + +if __name__ == '__main__': + app.run(port=8080) +``` + +## Retry Logic + +The webhook delivery service uses exponential backoff retry logic: + +- **Attempts:** Up to 5 attempts by default +- **Delays:** 1s → 2s → 4s → 8s → 16s +- **Timeout:** 30 seconds per attempt +- **Retry Conditions:** + - Server errors (5xx status codes) + - Network errors + - Timeouts +- **No Retry:** + - Client errors (4xx status codes) + - Successful delivery (2xx status codes) + +## Benefits + +1. **No Polling Required** - Eliminates constant API calls to check job status +2. **Real-time Notifications** - Immediate notification when jobs complete +3. **Reliable Delivery** - Exponential backoff ensures webhooks are delivered +4. **Flexible** - Choose between notification-only or full data delivery +5. **Secure** - Support for custom headers for authentication +6. **Configurable** - Global defaults or per-job configuration +7. 
**Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
+
+## TypeScript Client Example
+
+```typescript
+interface WebhookConfig {
+  webhook_url: string;
+  webhook_data_in_payload?: boolean;
+  webhook_headers?: Record<string, string>;
+}
+
+interface CrawlJobRequest {
+  urls: string[];
+  browser_config?: Record<string, any>;
+  crawler_config?: Record<string, any>;
+  webhook_config?: WebhookConfig;
+}
+
+interface LLMJobRequest {
+  url: string;
+  q: string;
+  schema?: string;
+  cache?: boolean;
+  provider?: string;
+  webhook_config?: WebhookConfig;
+}
+
+async function createCrawlJob(request: CrawlJobRequest) {
+  const response = await fetch('http://localhost:11235/crawl/job', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(request)
+  });
+
+  const { task_id } = await response.json();
+  return task_id;
+}
+
+async function createLLMJob(request: LLMJobRequest) {
+  const response = await fetch('http://localhost:11235/llm/job', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(request)
+  });
+
+  const { task_id } = await response.json();
+  return task_id;
+}
+
+// Usage - Crawl Job
+const crawlTaskId = await createCrawlJob({
+  urls: ['https://example.com'],
+  webhook_config: {
+    webhook_url: 'https://myapp.com/webhooks/crawl-complete',
+    webhook_data_in_payload: false,
+    webhook_headers: {
+      'X-Webhook-Secret': 'my-secret'
+    }
+  }
+});
+
+// Usage - LLM Extraction Job
+const llmTaskId = await createLLMJob({
+  url: 'https://example.com/article',
+  q: 'Extract the main points from this article',
+  provider: 'openai/gpt-4o-mini',
+  webhook_config: {
+    webhook_url: 'https://myapp.com/webhooks/llm-complete',
+    webhook_data_in_payload: true,
+    webhook_headers: {
+      'X-Webhook-Secret': 'my-secret'
+    }
+  }
+});
+```
+
+## Monitoring and Debugging
+
+Webhook delivery attempts are logged at INFO level:
+- Successful deliveries
+- Retry attempts with delays
+- Final failures after max attempts
+
+Check the application logs for webhook delivery status:
+```bash
+docker logs crawl4ai-container | grep -i webhook
+```
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index 64ac4a85..4fab27b1 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -46,6 +46,7 @@ from utils import (
     get_llm_temperature,
     get_llm_base_url
 )
+from webhook import WebhookDeliveryService
 
 import psutil, time
 
@@ -127,10 +128,14 @@ async def process_llm_extraction(
     schema: Optional[str] = None,
     cache: str = "0",
     provider: Optional[str] = None,
+    webhook_config: Optional[Dict] = None,
     temperature: Optional[float] = None,
     base_url: Optional[str] = None
 ) -> None:
     """Process LLM extraction in background."""
+    # Initialize webhook service
+    webhook_service = WebhookDeliveryService(config)
+
     try:
         # Validate provider
         is_valid, error_msg = validate_llm_provider(config, provider)
@@ -139,6 +144,16 @@
                 "status": TaskStatus.FAILED,
                 "error": error_msg
             })
+
+            # Send webhook notification on failure
+            await webhook_service.notify_job_completion(
+                task_id=task_id,
+                task_type="llm_extraction",
+                status="failed",
+                urls=[url],
+                webhook_config=webhook_config,
+                error=error_msg
+            )
             return
         api_key = get_llm_api_key(config, provider)  # Returns None to let litellm handle it
         llm_strategy = LLMExtractionStrategy(
@@ -169,17 +184,40 @@
                 "status": TaskStatus.FAILED,
                 "error": result.error_message
             })
+
+            # Send webhook notification on failure
+            await webhook_service.notify_job_completion(
+                task_id=task_id,
+
task_type="llm_extraction", + status="failed", + urls=[url], + webhook_config=webhook_config, + error=result.error_message + ) return try: content = json.loads(result.extracted_content) except json.JSONDecodeError: content = result.extracted_content + + result_data = {"extracted_content": content} + await redis.hset(f"task:{task_id}", mapping={ "status": TaskStatus.COMPLETED, "result": json.dumps(content) }) + # Send webhook notification on successful completion + await webhook_service.notify_job_completion( + task_id=task_id, + task_type="llm_extraction", + status="completed", + urls=[url], + webhook_config=webhook_config, + result=result_data + ) + except Exception as e: logger.error(f"LLM extraction error: {str(e)}", exc_info=True) await redis.hset(f"task:{task_id}", mapping={ @@ -187,6 +225,16 @@ async def process_llm_extraction( "error": str(e) }) + # Send webhook notification on failure + await webhook_service.notify_job_completion( + task_id=task_id, + task_type="llm_extraction", + status="failed", + urls=[url], + webhook_config=webhook_config, + error=str(e) + ) + async def handle_markdown_request( url: str, filter_type: FilterType, @@ -275,6 +323,7 @@ async def handle_llm_request( cache: str = "0", config: Optional[dict] = None, provider: Optional[str] = None, + webhook_config: Optional[Dict] = None, temperature: Optional[float] = None, api_base_url: Optional[str] = None ) -> JSONResponse: @@ -308,6 +357,7 @@ async def handle_llm_request( base_url, config, provider, + webhook_config, temperature, api_base_url ) @@ -355,6 +405,7 @@ async def create_new_task( base_url: str, config: dict, provider: Optional[str] = None, + webhook_config: Optional[Dict] = None, temperature: Optional[float] = None, api_base_url: Optional[str] = None ) -> JSONResponse: @@ -365,12 +416,18 @@ async def create_new_task( from datetime import datetime task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}" - - await redis.hset(f"task:{task_id}", mapping={ + + task_data = { "status": TaskStatus.PROCESSING, "created_at": datetime.now().isoformat(), "url": decoded_url - }) + } + + # Store webhook config if provided + if webhook_config: + task_data["webhook_config"] = json.dumps(webhook_config) + + await redis.hset(f"task:{task_id}", mapping=task_data) background_tasks.add_task( process_llm_extraction, @@ -382,6 +439,7 @@ async def create_new_task( schema, cache, provider, + webhook_config, temperature, api_base_url ) @@ -723,6 +781,7 @@ async def handle_crawl_job( browser_config: Dict, crawler_config: Dict, config: Dict, + webhook_config: Optional[Dict] = None, ) -> Dict: """ Fire-and-forget version of handle_crawl_request. @@ -730,13 +789,24 @@ async def handle_crawl_job( lets /crawl/job/{task_id} polling fetch the result. 
""" task_id = f"crawl_{uuid4().hex[:8]}" - await redis.hset(f"task:{task_id}", mapping={ + + # Store task data in Redis + task_data = { "status": TaskStatus.PROCESSING, # <-- keep enum values consistent "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(), "url": json.dumps(urls), # store list as JSON string "result": "", "error": "", - }) + } + + # Store webhook config if provided + if webhook_config: + task_data["webhook_config"] = json.dumps(webhook_config) + + await redis.hset(f"task:{task_id}", mapping=task_data) + + # Initialize webhook service + webhook_service = WebhookDeliveryService(config) async def _runner(): try: @@ -750,6 +820,17 @@ async def handle_crawl_job( "status": TaskStatus.COMPLETED, "result": json.dumps(result), }) + + # Send webhook notification on successful completion + await webhook_service.notify_job_completion( + task_id=task_id, + task_type="crawl", + status="completed", + urls=urls, + webhook_config=webhook_config, + result=result + ) + await asyncio.sleep(5) # Give Redis time to process the update except Exception as exc: await redis.hset(f"task:{task_id}", mapping={ @@ -757,5 +838,15 @@ async def handle_crawl_job( "error": str(exc), }) + # Send webhook notification on failure + await webhook_service.notify_job_completion( + task_id=task_id, + task_type="crawl", + status="failed", + urls=urls, + webhook_config=webhook_config, + error=str(exc) + ) + background_tasks.add_task(_runner) return {"task_id": task_id} \ No newline at end of file diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index d09396a5..5790d5be 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -87,4 +87,17 @@ observability: enabled: True endpoint: "/metrics" health_check: - endpoint: "/health" \ No newline at end of file + endpoint: "/health" + +# Webhook Configuration +webhooks: + enabled: true + default_url: null # Optional: default webhook URL for all jobs + data_in_payload: false # Optional: default behavior for including data + retry: + max_attempts: 5 + initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff + max_delay_ms: 32000 + timeout_ms: 30000 # 30s timeout per webhook call + headers: # Optional: default headers to include + User-Agent: "Crawl4AI-Webhook/1.0" \ No newline at end of file diff --git a/deploy/docker/job.py b/deploy/docker/job.py index 823dd8c8..8fae16cd 100644 --- a/deploy/docker/job.py +++ b/deploy/docker/job.py @@ -12,6 +12,7 @@ from api import ( handle_crawl_job, handle_task_status, ) +from schemas import WebhookConfig # ------------- dependency placeholders ------------- _redis = None # will be injected from server.py @@ -37,6 +38,7 @@ class LlmJobPayload(BaseModel): schema: Optional[str] = None cache: bool = False provider: Optional[str] = None + webhook_config: Optional[WebhookConfig] = None temperature: Optional[float] = None base_url: Optional[str] = None @@ -45,6 +47,7 @@ class CrawlJobPayload(BaseModel): urls: list[HttpUrl] browser_config: Dict = {} crawler_config: Dict = {} + webhook_config: Optional[WebhookConfig] = None # ---------- LL​M job --------------------------------------------------------- @@ -55,6 +58,10 @@ async def llm_job_enqueue( request: Request, _td: Dict = Depends(lambda: _token_dep()), # late-bound dep ): + webhook_config = None + if payload.webhook_config: + webhook_config = payload.webhook_config.model_dump(mode='json') + return await handle_llm_request( _redis, background_tasks, @@ -65,6 +72,7 @@ async def llm_job_enqueue( cache=payload.cache, config=_config, 
provider=payload.provider, + webhook_config=webhook_config, temperature=payload.temperature, api_base_url=payload.base_url, ) @@ -86,6 +94,10 @@ async def crawl_job_enqueue( background_tasks: BackgroundTasks, _td: Dict = Depends(lambda: _token_dep()), ): + webhook_config = None + if payload.webhook_config: + webhook_config = payload.webhook_config.model_dump(mode='json') + return await handle_crawl_job( _redis, background_tasks, @@ -93,6 +105,7 @@ async def crawl_job_enqueue( payload.browser_config, payload.crawler_config, config=_config, + webhook_config=webhook_config, ) diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt index d463c641..b33c081f 100644 --- a/deploy/docker/requirements.txt +++ b/deploy/docker/requirements.txt @@ -12,6 +12,6 @@ pydantic>=2.11 rank-bm25==0.2.2 anyio==4.9.0 PyJWT==2.10.1 -mcp>=1.6.0 +mcp>=1.18.0 websockets>=15.0.1 httpx[http2]>=0.27.2 diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py index 792936bb..21d47fc4 100644 --- a/deploy/docker/schemas.py +++ b/deploy/docker/schemas.py @@ -1,6 +1,6 @@ from typing import List, Optional, Dict from enum import Enum -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, HttpUrl from utils import FilterType @@ -85,4 +85,22 @@ class JSEndpointRequest(BaseModel): scripts: List[str] = Field( ..., description="List of separated JavaScript snippets to execute" - ) \ No newline at end of file + ) + + +class WebhookConfig(BaseModel): + """Configuration for webhook notifications.""" + webhook_url: HttpUrl + webhook_data_in_payload: bool = False + webhook_headers: Optional[Dict[str, str]] = None + + +class WebhookPayload(BaseModel): + """Payload sent to webhook endpoints.""" + task_id: str + task_type: str # "crawl", "llm_extraction", etc. + status: str # "completed" or "failed" + timestamp: str # ISO 8601 format + urls: List[str] + error: Optional[str] = None + data: Optional[Dict] = None # Included only if webhook_data_in_payload=True \ No newline at end of file diff --git a/deploy/docker/webhook.py b/deploy/docker/webhook.py new file mode 100644 index 00000000..ebee9dff --- /dev/null +++ b/deploy/docker/webhook.py @@ -0,0 +1,159 @@ +""" +Webhook delivery service for Crawl4AI. + +This module provides webhook notification functionality with exponential backoff retry logic. +""" +import asyncio +import httpx +import logging +from typing import Dict, Optional +from datetime import datetime, timezone + +logger = logging.getLogger(__name__) + + +class WebhookDeliveryService: + """Handles webhook delivery with exponential backoff retry logic.""" + + def __init__(self, config: Dict): + """ + Initialize the webhook delivery service. + + Args: + config: Application configuration dictionary containing webhook settings + """ + self.config = config.get("webhooks", {}) + self.max_attempts = self.config.get("retry", {}).get("max_attempts", 5) + self.initial_delay = self.config.get("retry", {}).get("initial_delay_ms", 1000) / 1000 + self.max_delay = self.config.get("retry", {}).get("max_delay_ms", 32000) / 1000 + self.timeout = self.config.get("retry", {}).get("timeout_ms", 30000) / 1000 + + async def send_webhook( + self, + webhook_url: str, + payload: Dict, + headers: Optional[Dict[str, str]] = None + ) -> bool: + """ + Send webhook with exponential backoff retry logic. 
+ + Args: + webhook_url: The URL to send the webhook to + payload: The JSON payload to send + headers: Optional custom headers + + Returns: + bool: True if delivered successfully, False otherwise + """ + default_headers = self.config.get("headers", {}) + merged_headers = {**default_headers, **(headers or {})} + merged_headers["Content-Type"] = "application/json" + + async with httpx.AsyncClient(timeout=self.timeout) as client: + for attempt in range(self.max_attempts): + try: + logger.info( + f"Sending webhook (attempt {attempt + 1}/{self.max_attempts}) to {webhook_url}" + ) + + response = await client.post( + webhook_url, + json=payload, + headers=merged_headers + ) + + # Success or client error (don't retry client errors) + if response.status_code < 500: + if 200 <= response.status_code < 300: + logger.info(f"Webhook delivered successfully to {webhook_url}") + return True + else: + logger.warning( + f"Webhook rejected with status {response.status_code}: {response.text[:200]}" + ) + return False # Client error - don't retry + + # Server error - retry with backoff + logger.warning( + f"Webhook failed with status {response.status_code}, will retry" + ) + + except httpx.TimeoutException as exc: + logger.error(f"Webhook timeout (attempt {attempt + 1}): {exc}") + except httpx.RequestError as exc: + logger.error(f"Webhook request error (attempt {attempt + 1}): {exc}") + except Exception as exc: + logger.error(f"Webhook delivery error (attempt {attempt + 1}): {exc}") + + # Calculate exponential backoff delay + if attempt < self.max_attempts - 1: + delay = min(self.initial_delay * (2 ** attempt), self.max_delay) + logger.info(f"Retrying in {delay}s...") + await asyncio.sleep(delay) + + logger.error( + f"Webhook delivery failed after {self.max_attempts} attempts to {webhook_url}" + ) + return False + + async def notify_job_completion( + self, + task_id: str, + task_type: str, + status: str, + urls: list, + webhook_config: Optional[Dict], + result: Optional[Dict] = None, + error: Optional[str] = None + ): + """ + Notify webhook of job completion. 
+ + Args: + task_id: The task identifier + task_type: Type of task (e.g., "crawl", "llm_extraction") + status: Task status ("completed" or "failed") + urls: List of URLs that were crawled + webhook_config: Webhook configuration from the job request + result: Optional crawl result data + error: Optional error message if failed + """ + # Determine webhook URL + webhook_url = None + data_in_payload = self.config.get("data_in_payload", False) + custom_headers = None + + if webhook_config: + webhook_url = webhook_config.get("webhook_url") + data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload) + custom_headers = webhook_config.get("webhook_headers") + + if not webhook_url: + webhook_url = self.config.get("default_url") + + if not webhook_url: + logger.debug("No webhook URL configured, skipping notification") + return + + # Check if webhooks are enabled + if not self.config.get("enabled", True): + logger.debug("Webhooks are disabled, skipping notification") + return + + # Build payload + payload = { + "task_id": task_id, + "task_type": task_type, + "status": status, + "timestamp": datetime.now(timezone.utc).isoformat(), + "urls": urls + } + + if error: + payload["error"] = error + + if data_in_payload and result: + payload["data"] = result + + # Send webhook (fire and forget - don't block on completion) + await self.send_webhook(webhook_url, payload, custom_headers) diff --git a/docs/blog/release-v0.7.4.md b/docs/blog/release-v0.7.4.md index d9a57845..72cfe3ae 100644 --- a/docs/blog/release-v0.7.4.md +++ b/docs/blog/release-v0.7.4.md @@ -10,7 +10,6 @@ Today I'm releasing Crawl4AI v0.7.4—the Intelligent Table Extraction & Perform - **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables - **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations -- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management - **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation - **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms - **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution @@ -158,40 +157,6 @@ async with AsyncWebCrawler() as crawler: - **Monitoring Systems**: Faster health checks and status page monitoring - **Data Aggregation**: Improved performance for real-time data collection -## 🧹 Memory Management Refactor: Cleaner Architecture - -**The Problem:** Memory utilities were scattered and difficult to maintain, with potential import conflicts and unclear organization. - -**My Solution:** I consolidated all memory-related utilities into the main `utils.py` module, creating a cleaner, more maintainable architecture. 
- -### Improved Memory Handling - -```python -# All memory utilities now consolidated -from crawl4ai.utils import get_true_memory_usage_percent, MemoryMonitor - -# Enhanced memory monitoring -monitor = MemoryMonitor() -monitor.start_monitoring() - -async with AsyncWebCrawler() as crawler: - # Memory-efficient batch processing - results = await crawler.arun_many(large_url_list) - - # Get accurate memory metrics - memory_usage = get_true_memory_usage_percent() - memory_report = monitor.get_report() - - print(f"Memory efficiency: {memory_report['efficiency']:.1f}%") - print(f"Peak usage: {memory_report['peak_mb']:.1f} MB") -``` - -**Expected Real-World Impact:** -- **Production Stability**: More reliable memory tracking and management -- **Code Maintainability**: Cleaner architecture for easier debugging -- **Import Clarity**: Resolved potential conflicts and import issues -- **Developer Experience**: Simpler API for memory monitoring - ## 🔧 Critical Stability Fixes ### Browser Manager Race Condition Resolution diff --git a/docs/blog/release-v0.7.5.md b/docs/blog/release-v0.7.5.md new file mode 100644 index 00000000..977d2fd9 --- /dev/null +++ b/docs/blog/release-v0.7.5.md @@ -0,0 +1,318 @@ +# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update + +*September 29, 2025 • 8 min read* + +--- + +Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements. + +## 🎯 What's New at a Glance + +- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API +- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion +- **Enhanced LLM Integration**: Custom providers with temperature control +- **HTTPS Preservation**: Secure internal link handling +- **Bug Fixes**: Resolved multiple community-reported issues +- **Improved Docker Error Handling**: Better debugging and reliability + +## 🔧 Docker Hooks System: Pipeline Customization + +Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline. 
+ +### Real Example: Authentication & Performance + +```python +import requests + +# Real working hooks for httpbin.org +hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print("Hook: Setting up page context") + # Block images to speed up crawling + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + print("Hook: Images blocked") + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print("Hook: Before retrieving HTML") + # Scroll to bottom to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + print("Hook: Scrolled to bottom") + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f"Hook: About to navigate to {url}") + # Add custom headers + await page.set_extra_http_headers({ + 'X-Test-Header': 'crawl4ai-hooks-test' + }) + return page +""" +} + +# Test with Docker API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": { + "code": hooks_config, + "timeout": 30 + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +result = response.json() + +if result.get('success'): + print("✅ Hooks executed successfully!") + print(f"Content length: {len(result.get('markdown', ''))} characters") +``` + +**Available Hook Points:** +- `on_browser_created`: Browser setup +- `on_page_context_created`: Page context configuration +- `before_goto`: Pre-navigation setup +- `after_goto`: Post-navigation processing +- `on_user_agent_updated`: User agent changes +- `on_execution_started`: Crawl initialization +- `before_retrieve_html`: Pre-extraction processing +- `before_return_html`: Final HTML processing + +### Function-Based Hooks API + +Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion! + +**Option 1: Using the `hooks_to_string()` Utility** + +```python +from crawl4ai import hooks_to_string +import requests + +# Define hooks as regular Python functions (with full IDE support!) 
+async def on_page_context_created(page, context, **kwargs): + """Block images to speed up crawling""" + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page + +async def before_goto(page, context, url, **kwargs): + """Add custom headers""" + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'v0.7.5', + 'X-Custom-Header': 'my-value' + }) + return page + +# Convert functions to strings +hooks_code = hooks_to_string({ + "on_page_context_created": on_page_context_created, + "before_goto": before_goto +}) + +# Use with REST API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": {"code": hooks_code, "timeout": 30} +} +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +**Option 2: Docker Client with Automatic Conversion (Recommended!)** + +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Define hooks as functions (same as above) +async def on_page_context_created(page, context, **kwargs): + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + return page + +async def before_retrieve_html(page, context, **kwargs): + # Scroll to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + return page + +# Use Docker client - conversion happens automatically! +client = Crawl4aiDockerClient(base_url="http://localhost:11235") + +results = await client.crawl( + urls=["https://httpbin.org/html"], + hooks={ + "on_page_context_created": on_page_context_created, + "before_retrieve_html": before_retrieve_html + }, + hooks_timeout=30 +) + +if results and results.success: + print(f"✅ Hooks executed! HTML length: {len(results.html)}") +``` + +**Benefits of Function-Based Hooks:** +- ✅ Full IDE support (autocomplete, syntax highlighting) +- ✅ Type checking and linting +- ✅ Easier to test and debug +- ✅ Reusable across projects +- ✅ Automatic conversion in Docker client +- ✅ No breaking changes - string hooks still work! + +## 🤖 Enhanced LLM Integration + +Enhanced LLM integration with custom providers, temperature control, and base URL configuration. + +### Multi-Provider Support + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +# Test with different providers +async def test_llm_providers(): + # OpenAI with custom temperature + openai_strategy = LLMExtractionStrategy( + provider="gemini/gemini-2.5-flash-lite", + api_token="your-api-token", + temperature=0.7, # New in v0.7.5 + instruction="Summarize this page in one sentence" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + "https://example.com", + config=CrawlerRunConfig(extraction_strategy=openai_strategy) + ) + + if result.success: + print("✅ LLM extraction completed") + print(result.extracted_content) + +# Docker API with enhanced LLM config +llm_payload = { + "url": "https://example.com", + "f": "llm", + "q": "Summarize this page in one sentence.", + "provider": "gemini/gemini-2.5-flash-lite", + "temperature": 0.7 +} + +response = requests.post("http://localhost:11235/md", json=llm_payload) +``` + +**New Features:** +- Custom `temperature` parameter for creativity control +- `base_url` for custom API endpoints +- Multi-provider environment variable support +- Docker API integration + +## 🔒 HTTPS Preservation + +**The Problem:** Modern web apps require HTTPS everywhere. 
When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear. + +**Solution:** HTTPS preservation maintains secure protocols throughout crawling. + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy + +async def test_https_preservation(): + # Enable HTTPS preservation + url_filter = URLPatternFilter( + patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"] + ) + + config = CrawlerRunConfig( + exclude_external_links=True, + preserve_https_for_internal_links=True, # New in v0.7.5 + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, + filter_chain=FilterChain([url_filter]) + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun( + url="https://quotes.toscrape.com", + config=config + ): + # All internal links maintain HTTPS + internal_links = [link['href'] for link in result.links['internal']] + https_links = [link for link in internal_links if link.startswith('https://')] + + print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}") + for link in https_links[:3]: + print(f" → {link}") +``` + +## 🛠️ Bug Fixes and Improvements + +### Major Fixes +- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332) +- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated) +- **Docker Error Handling**: Comprehensive error messages with status codes +- **Memory Management**: Fixed leaks in long-running sessions +- **JWT Authentication**: Fixed Docker JWT validation issues (#1442) +- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481) +- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505) +- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419) +- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291) +- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989) + +### Community-Reported Issues Fixed +This release addresses multiple issues reported by the community through GitHub issues and Discord discussions: +- Fixed browser configuration reference errors +- Resolved dependency conflicts with cssselect +- Improved error messaging for failed authentications +- Enhanced compatibility with various proxy configurations +- Fixed edge cases in URL normalization + +### Configuration Updates +```python +# Old proxy config (deprecated) +# browser_config = BrowserConfig(proxy="http://proxy:8080") + +# New enhanced proxy config +browser_config = BrowserConfig( + proxy_config={ + "server": "http://proxy:8080", + "username": "optional-user", + "password": "optional-pass" + } +) +``` + +## 🔄 Breaking Changes + +1. **Python 3.10+ Required**: Upgrade from Python 3.9 +2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure +3. 
+3. **New Dependency**: Added `cssselect` for better CSS handling
+
+## 🚀 Get Started
+
+```bash
+# Install latest version
+pip install crawl4ai==0.7.5
+
+# Docker deployment
+docker pull unclecode/crawl4ai:latest
+docker run -p 11235:11235 unclecode/crawl4ai:latest
+```
+
+**Try the Demo:**
+```bash
+# Run working examples
+python docs/releases_review/demo_v0.7.5.py
+```
+
+**Resources:**
+- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
+- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
+- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
+- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
+
+Happy crawling! 🕷️
diff --git a/docs/blog/release-v0.7.6.md b/docs/blog/release-v0.7.6.md
new file mode 100644
index 00000000..e27d19cc
--- /dev/null
+++ b/docs/blog/release-v0.7.6.md
@@ -0,0 +1,314 @@
+# Crawl4AI v0.7.6 Release Notes
+
+*Release Date: October 22, 2025*
+
+I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
+
+## 🎯 What's New
+
+### Webhook Support for Docker Job Queue API
+
+The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check whether your jobs are done - you get an instant notification when they complete.
+
+**Key Capabilities:**
+
+- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
+- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
+- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s); see the sketch after the examples below
+- ✅ **Custom Authentication**: Add custom headers for webhook authentication
+- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
+- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
+
+### How It Works
+
+Instead of constantly checking job status:
+
+**OLD WAY (Polling):**
+```python
+import time
+import requests
+
+# Submit job
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+task_id = response.json()['task_id']
+
+# Poll until complete
+while True:
+    status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
+    if status.json()['status'] == 'completed':
+        break
+    time.sleep(5)  # Wait and try again
+```
+
+**NEW WAY (Webhooks):**
+```python
+import requests
+
+# Submit job with webhook
+payload = {
+    "urls": ["https://example.com"],
+    "webhook_config": {
+        "webhook_url": "https://myapp.com/webhook",
+        "webhook_data_in_payload": True
+    }
+}
+response = requests.post("http://localhost:11235/crawl/job", json=payload)
+
+# Done! The webhook will notify you when the job completes
+# Your webhook handler receives the results automatically
+```
+
+### Crawl Job Webhooks
+
+```bash
+curl -X POST http://localhost:11235/crawl/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "urls": ["https://example.com"],
+    "browser_config": {"headless": true},
+    "crawler_config": {"cache_mode": "bypass"},
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/crawl-complete",
+      "webhook_data_in_payload": false,
+      "webhook_headers": {
+        "X-Webhook-Secret": "your-secret-token"
+      }
+    }
+  }'
+```
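+
+A note on the retry schedule mentioned above: each failed delivery doubles the previous delay, starting from the configured initial delay and capped at a maximum (both knobs appear in the `config.yml` section below). A quick sketch of that arithmetic, using the documented defaults:
+
+```python
+# Retry schedule sketch: exponential doubling, capped at max_delay_ms
+initial_delay_ms, max_delay_ms, max_attempts = 1000, 32000, 5
+delays = [min(initial_delay_ms * 2 ** n, max_delay_ms) for n in range(max_attempts)]
+print([d / 1000 for d in delays])  # [1.0, 2.0, 4.0, 8.0, 16.0] seconds
+```
+
+### LLM Extraction Job Webhooks (NEW!)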
+
+```bash
+curl -X POST http://localhost:11235/llm/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "url": "https://example.com/article",
+    "q": "Extract the article title, author, and publication date",
+    "schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
+    "provider": "openai/gpt-4o-mini",
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/llm-complete",
+      "webhook_data_in_payload": true
+    }
+  }'
+```
+
+### Webhook Payload Structure
+
+**Success (with data):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22"
+    }
+  }
+}
+```
+
+**Failure:**
+```json
+{
+  "task_id": "crawl_abc123",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30s"
+}
+```
+
+### Simple Webhook Handler Example
+
+```python
+import requests
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+@app.route('/webhook', methods=['POST'])
+def handle_webhook():
+    payload = request.json
+
+    task_id = payload['task_id']
+    task_type = payload['task_type']
+    status = payload['status']
+
+    if status == 'completed':
+        if 'data' in payload:
+            # Process data directly
+            data = payload['data']
+        else:
+            # Fetch from API
+            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
+            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
+            data = response.json()
+
+        # Your business logic here
+        print(f"Job {task_id} completed!")
+
+    elif status == 'failed':
+        error = payload.get('error', 'Unknown error')
+        print(f"Job {task_id} failed: {error}")
+
+    return jsonify({"status": "received"}), 200
+
+app.run(port=8080)
+```
+
+## 📊 Performance Improvements
+
+- **Reduced Server Load**: Eliminates constant polling requests
+- **Lower Latency**: Instant notification vs. polling interval delay
+- **Better Resource Usage**: Frees up client connections while jobs run in background
+- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
+
+## 🐛 Bug Fixes
+
+- Fixed webhook configuration serialization for Pydantic HttpUrl fields
+- Improved error handling in webhook delivery service
+- Enhanced Redis task storage for webhook config persistence
+
+## 🌍 Expected Real-World Impact
+
+### For Web Scraping Workflows
+- **Reduced Costs**: Fewer API calls mean lower bandwidth and server costs
+- **Better UX**: Instant notifications improve user experience
+- **Scalability**: Handle 100s of concurrent jobs without polling overhead
+
+### For LLM Extraction Pipelines
+- **Async Processing**: Submit LLM extraction jobs and move on
+- **Batch Processing**: Queue multiple extractions, get notified as they complete
+- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
+
+### For Microservices
+- **Event-Driven**: Perfect for event-driven microservice architectures
+- **Decoupling**: Decouple job submission from result processing
+- **Reliability**: Automatic retries ensure webhooks are delivered (see the idempotency sketch below)
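+
+Because retries give you at-least-once delivery, a handler can receive the same `task_id` more than once. A minimal idempotency sketch (in-memory for brevity; use Redis or a database in production):
+
+```python
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+seen_task_ids = set()  # in-memory dedup store, resets on restart
+
+@app.route('/webhook', methods=['POST'])
+def handle_webhook():
+    payload = request.json
+    task_id = payload['task_id']
+    if task_id in seen_task_ids:
+        # Duplicate delivery from a retry: acknowledge, but skip reprocessing
+        return jsonify({"status": "duplicate"}), 200
+    seen_task_ids.add(task_id)
+    # ... process payload as in the handler above ...
+    return jsonify({"status": "received"}), 200
+```
+
+## 🔄 Breaking Changes
+
+**None!** This release is fully backward compatible.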
+ +- Webhook configuration is optional +- Existing code continues to work without modification +- Polling is still supported for jobs without webhook config + +## 📚 Documentation + +### New Documentation +- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide +- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples + +### Updated Documentation +- **[Docker README](../deploy/docker/README.md)** - Added webhook sections +- API documentation with webhook examples + +## 🛠️ Migration Guide + +No migration needed! Webhooks are opt-in: + +1. **To use webhooks**: Add `webhook_config` to your job payload +2. **To keep polling**: Continue using your existing code + +### Quick Start + +```python +# Just add webhook_config to your existing payload +payload = { + # Your existing configuration + "urls": ["https://example.com"], + "browser_config": {...}, + "crawler_config": {...}, + + # NEW: Add webhook configuration + "webhook_config": { + "webhook_url": "https://myapp.com/webhook", + "webhook_data_in_payload": True + } +} +``` + +## 🔧 Configuration + +### Global Webhook Configuration (config.yml) + +```yaml +webhooks: + enabled: true + default_url: "https://myapp.com/webhooks/default" # Optional + data_in_payload: false + retry: + max_attempts: 5 + initial_delay_ms: 1000 + max_delay_ms: 32000 + timeout_ms: 30000 + headers: + User-Agent: "Crawl4AI-Webhook/1.0" +``` + +## 🚀 Upgrade Instructions + +### Docker + +```bash +# Pull the latest image +docker pull unclecode/crawl4ai:0.7.6 + +# Or use latest tag +docker pull unclecode/crawl4ai:latest + +# Run with webhook support +docker run -d \ + -p 11235:11235 \ + --env-file .llm.env \ + --name crawl4ai \ + unclecode/crawl4ai:0.7.6 +``` + +### Python Package + +```bash +pip install --upgrade crawl4ai +``` + +## 💡 Pro Tips + +1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads +2. **Set custom headers** for webhook authentication and request tracking +3. **Configure global default webhook** for consistent handling across all jobs +4. **Implement idempotent webhook handlers** - same webhook may be delivered multiple times on retry +5. **Use structured schemas** with LLM extraction for predictable webhook data + +## 🎬 Demo + +Try the release demo: + +```bash +python docs/releases_review/demo_v0.7.6.py +``` + +This comprehensive demo showcases: +- Crawl job webhooks (notification-only and with data) +- LLM extraction webhooks (with JSON schema support) +- Custom headers for authentication +- Webhook retry mechanism +- Real-time webhook receiver + +## 🙏 Acknowledgments + +Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing. + +## 📞 Support + +- **Documentation**: https://docs.crawl4ai.com +- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues +- **Discord**: https://discord.gg/crawl4ai + +--- + +**Happy crawling with webhooks!** 🕷️🪝 + +*- unclecode* diff --git a/docs/examples/docker_client_hooks_example.py b/docs/examples/docker_client_hooks_example.py new file mode 100644 index 00000000..1aa27fdc --- /dev/null +++ b/docs/examples/docker_client_hooks_example.py @@ -0,0 +1,522 @@ +#!/usr/bin/env python3 +""" +Comprehensive hooks examples using Docker Client with function objects. 
+
+This approach is recommended because:
+- Write hooks as regular Python functions
+- Full IDE support (autocomplete, type checking)
+- Automatic conversion to API format
+- Reusable and testable code
+- Clean, readable syntax
+"""
+
+import asyncio
+from crawl4ai import Crawl4aiDockerClient
+
+API_BASE_URL = "http://localhost:11235"
+
+
+# ============================================================================
+# Hook Function Definitions
+# ============================================================================
+
+# --- All Hooks Demo ---
+async def browser_created_hook(browser, **kwargs):
+    """Called after browser is created"""
+    print("[HOOK] Browser created and ready")
+    return browser
+
+
+async def page_context_hook(page, context, **kwargs):
+    """Setup page environment"""
+    print("[HOOK] Setting up page environment")
+
+    # Set viewport
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+
+    # Add cookies
+    await context.add_cookies([{
+        "name": "test_session",
+        "value": "abc123xyz",
+        "domain": ".httpbin.org",
+        "path": "/"
+    }])
+
+    # Block resources
+    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
+    await context.route("**/analytics/*", lambda route: route.abort())
+
+    print("[HOOK] Environment configured")
+    return page
+
+
+async def user_agent_hook(page, context, user_agent, **kwargs):
+    """Called when user agent is updated"""
+    print(f"[HOOK] User agent: {user_agent[:50]}...")
+    return page
+
+
+async def before_goto_hook(page, context, url, **kwargs):
+    """Called before navigating to URL"""
+    print(f"[HOOK] Navigating to: {url}")
+
+    await page.set_extra_http_headers({
+        "X-Custom-Header": "crawl4ai-test",
+        "Accept-Language": "en-US"
+    })
+
+    return page
+
+
+async def after_goto_hook(page, context, url, response, **kwargs):
+    """Called after page loads"""
+    print(f"[HOOK] Page loaded: {url}")
+
+    await page.wait_for_timeout(1000)
+
+    try:
+        await page.wait_for_selector("body", timeout=2000)
+        print("[HOOK] Body element ready")
+    except Exception:
+        print("[HOOK] Timeout, continuing")
+
+    return page
+
+
+async def execution_started_hook(page, context, **kwargs):
+    """Called when custom JS execution starts"""
+    print("[HOOK] JS execution started")
+    await page.evaluate("console.log('[HOOK] Custom JS');")
+    return page
+
+
+async def before_retrieve_hook(page, context, **kwargs):
+    """Called before retrieving HTML"""
+    print("[HOOK] Preparing HTML retrieval")
+
+    # Scroll for lazy content
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
+    await page.wait_for_timeout(500)
+    await page.evaluate("window.scrollTo(0, 0);")
+
+    print("[HOOK] Scrolling complete")
+    return page
+
+
+async def before_return_hook(page, context, html, **kwargs):
+    """Called before returning HTML"""
+    print(f"[HOOK] HTML ready: {len(html)} chars")
+
+    metrics = await page.evaluate('''() => ({
+        images: document.images.length,
+        links: document.links.length,
+        scripts: document.scripts.length
+    })''')
+
+    print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
+    return page
+
+
+# --- Authentication Hooks ---
+async def auth_context_hook(page, context, **kwargs):
+    """Setup authentication context"""
+    print("[HOOK] Setting up authentication")
+
+    # Add auth cookies
+    await context.add_cookies([{
+        "name": "auth_token",
+        "value": "fake_jwt_token",
+        "domain": ".httpbin.org",
+        "path": "/",
+        "httpOnly": True
+    }])
+
+    # Set localStorage
+    await page.evaluate('''
+        
localStorage.setItem('user_id', '12345'); + localStorage.setItem('auth_time', new Date().toISOString()); + ''') + + print("[HOOK] Auth context ready") + return page + + +async def auth_headers_hook(page, context, url, **kwargs): + """Add authentication headers""" + print(f"[HOOK] Adding auth headers for {url}") + + import base64 + credentials = base64.b64encode(b"user:passwd").decode('ascii') + + await page.set_extra_http_headers({ + 'Authorization': f'Basic {credentials}', + 'X-API-Key': 'test-key-123' + }) + + return page + + +# --- Performance Optimization Hooks --- +async def performance_hook(page, context, **kwargs): + """Optimize page for performance""" + print("[HOOK] Optimizing for performance") + + # Block resource-heavy content + await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda r: r.abort()) + await context.route("**/*.{woff,woff2,ttf}", lambda r: r.abort()) + await context.route("**/*.{mp4,webm,ogg}", lambda r: r.abort()) + await context.route("**/googletagmanager.com/*", lambda r: r.abort()) + await context.route("**/google-analytics.com/*", lambda r: r.abort()) + await context.route("**/facebook.com/*", lambda r: r.abort()) + + # Disable animations + await page.add_style_tag(content=''' + *, *::before, *::after { + animation-duration: 0s !important; + transition-duration: 0s !important; + } + ''') + + print("[HOOK] Optimizations applied") + return page + + +async def cleanup_hook(page, context, **kwargs): + """Clean page before extraction""" + print("[HOOK] Cleaning page") + + await page.evaluate('''() => { + const selectors = [ + '.ad', '.ads', '.advertisement', + '.popup', '.modal', '.overlay', + '.cookie-banner', '.newsletter' + ]; + + selectors.forEach(sel => { + document.querySelectorAll(sel).forEach(el => el.remove()); + }); + + document.querySelectorAll('script, style').forEach(el => el.remove()); + }''') + + print("[HOOK] Page cleaned") + return page + + +# --- Content Extraction Hooks --- +async def wait_dynamic_content_hook(page, context, url, response, **kwargs): + """Wait for dynamic content to load""" + print(f"[HOOK] Waiting for dynamic content on {url}") + + await page.wait_for_timeout(2000) + + # Click "Load More" if exists + try: + load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")') + if load_more: + await load_more.click() + await page.wait_for_timeout(1000) + print("[HOOK] Clicked 'Load More'") + except: + pass + + return page + + +async def extract_metadata_hook(page, context, **kwargs): + """Extract page metadata""" + print("[HOOK] Extracting metadata") + + metadata = await page.evaluate('''() => { + const getMeta = (name) => { + const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); + return el ? 
el.getAttribute('content') : null; + }; + + return { + title: document.title, + description: getMeta('description'), + author: getMeta('author'), + keywords: getMeta('keywords'), + }; + }''') + + print(f"[HOOK] Metadata: {metadata}") + + # Infinite scroll + for i in range(3): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + await page.wait_for_timeout(1000) + print(f"[HOOK] Scroll {i+1}/3") + + return page + + +# --- Multi-URL Hooks --- +async def url_specific_hook(page, context, url, **kwargs): + """Apply URL-specific logic""" + print(f"[HOOK] Processing URL: {url}") + + # URL-specific headers + if 'html' in url: + await page.set_extra_http_headers({"X-Type": "HTML"}) + elif 'json' in url: + await page.set_extra_http_headers({"X-Type": "JSON"}) + + return page + + +async def track_progress_hook(page, context, url, response, **kwargs): + """Track crawl progress""" + status = response.status if response else 'unknown' + print(f"[HOOK] Loaded {url} - Status: {status}") + return page + + +# ============================================================================ +# Test Functions +# ============================================================================ + +async def test_all_hooks_comprehensive(): + """Test all 8 hook types""" + print("=" * 70) + print("Test 1: All Hooks Comprehensive Demo (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nCrawling with all 8 hooks...") + + # Define hooks with function objects + hooks = { + "on_browser_created": browser_created_hook, + "on_page_context_created": page_context_hook, + "on_user_agent_updated": user_agent_hook, + "before_goto": before_goto_hook, + "after_goto": after_goto_hook, + "on_execution_started": execution_started_hook, + "before_retrieve_html": before_retrieve_hook, + "before_return_html": before_return_hook + } + + result = await client.crawl( + ["https://httpbin.org/html"], + hooks=hooks, + hooks_timeout=30 + ) + + print("\n✅ Success!") + print(f" URL: {result.url}") + print(f" Success: {result.success}") + print(f" HTML: {len(result.html)} chars") + + +async def test_authentication_workflow(): + """Test authentication with hooks""" + print("\n" + "=" * 70) + print("Test 2: Authentication Workflow (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting authentication...") + + hooks = { + "on_page_context_created": auth_context_hook, + "before_goto": auth_headers_hook + } + + result = await client.crawl( + ["https://httpbin.org/basic-auth/user/passwd"], + hooks=hooks, + hooks_timeout=15 + ) + + print("\n✅ Authentication completed") + + if result.success: + if '"authenticated"' in result.html and 'true' in result.html: + print(" ✅ Basic auth successful!") + else: + print(" ⚠️ Auth status unclear") + else: + print(f" ❌ Failed: {result.error_message}") + + +async def test_performance_optimization(): + """Test performance optimization""" + print("\n" + "=" * 70) + print("Test 3: Performance Optimization (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting performance hooks...") + + hooks = { + "on_page_context_created": performance_hook, + "before_retrieve_html": cleanup_hook + } + + result = await client.crawl( + ["https://httpbin.org/html"], + hooks=hooks, + hooks_timeout=10 + ) + + print("\n✅ Optimization completed") + print(f" HTML size: 
{len(result.html):,} chars") + print(" Resources blocked, ads removed") + + +async def test_content_extraction(): + """Test content extraction""" + print("\n" + "=" * 70) + print("Test 4: Content Extraction (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nTesting extraction hooks...") + + hooks = { + "after_goto": wait_dynamic_content_hook, + "before_retrieve_html": extract_metadata_hook + } + + result = await client.crawl( + ["https://www.kidocode.com/"], + hooks=hooks, + hooks_timeout=20 + ) + + print("\n✅ Extraction completed") + print(f" URL: {result.url}") + print(f" Success: {result.success}") + print(f" Metadata: {result.metadata}") + + +async def test_multi_url_crawl(): + """Test hooks with multiple URLs""" + print("\n" + "=" * 70) + print("Test 5: Multi-URL Crawl (Docker Client)") + print("=" * 70) + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nCrawling multiple URLs...") + + hooks = { + "before_goto": url_specific_hook, + "after_goto": track_progress_hook + } + + results = await client.crawl( + [ + "https://httpbin.org/html", + "https://httpbin.org/json", + "https://httpbin.org/xml" + ], + hooks=hooks, + hooks_timeout=15 + ) + + print("\n✅ Multi-URL crawl completed") + print(f"\n Crawled {len(results)} URLs:") + for i, result in enumerate(results, 1): + status = "✅" if result.success else "❌" + print(f" {status} {i}. {result.url}") + + +async def test_reusable_hook_library(): + """Test using reusable hook library""" + print("\n" + "=" * 70) + print("Test 6: Reusable Hook Library (Docker Client)") + print("=" * 70) + + # Create a library of reusable hooks + class HookLibrary: + @staticmethod + async def block_images(page, context, **kwargs): + """Block all images""" + await context.route("**/*.{png,jpg,jpeg,gif}", lambda r: r.abort()) + print("[LIBRARY] Images blocked") + return page + + @staticmethod + async def block_analytics(page, context, **kwargs): + """Block analytics""" + await context.route("**/analytics/*", lambda r: r.abort()) + await context.route("**/google-analytics.com/*", lambda r: r.abort()) + print("[LIBRARY] Analytics blocked") + return page + + @staticmethod + async def scroll_infinite(page, context, **kwargs): + """Handle infinite scroll""" + for i in range(5): + prev = await page.evaluate("document.body.scrollHeight") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + await page.wait_for_timeout(1000) + curr = await page.evaluate("document.body.scrollHeight") + if curr == prev: + break + print("[LIBRARY] Infinite scroll complete") + return page + + async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client: + print("\nUsing hook library...") + + hooks = { + "on_page_context_created": HookLibrary.block_images, + "before_retrieve_html": HookLibrary.scroll_infinite + } + + result = await client.crawl( + ["https://www.kidocode.com/"], + hooks=hooks, + hooks_timeout=20 + ) + + print("\n✅ Library hooks completed") + print(f" Success: {result.success}") + + +# ============================================================================ +# Main +# ============================================================================ + +async def main(): + """Run all Docker client hook examples""" + print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)") + print("Using Python function objects with automatic conversion") + print("=" * 70) + + tests = [ + ("All Hooks Demo", 
test_all_hooks_comprehensive), + ("Authentication", test_authentication_workflow), + ("Performance", test_performance_optimization), + ("Extraction", test_content_extraction), + ("Multi-URL", test_multi_url_crawl), + ("Hook Library", test_reusable_hook_library) + ] + + for i, (name, test_func) in enumerate(tests, 1): + try: + await test_func() + print(f"\n✅ Test {i}/{len(tests)}: {name} completed\n") + except Exception as e: + print(f"\n❌ Test {i}/{len(tests)}: {name} failed: {e}\n") + import traceback + traceback.print_exc() + + print("=" * 70) + print("🎉 All Docker client hook examples completed!") + print("\n💡 Key Benefits of Function-Based Hooks:") + print(" • Write as regular Python functions") + print(" • Full IDE support (autocomplete, types)") + print(" • Automatic conversion to API format") + print(" • Reusable across projects") + print(" • Clean, readable code") + print(" • Easy to test and debug") + print("=" * 70) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/examples/docker_hooks_examples.py b/docs/examples/docker_hooks_examples.py index a9c94d03..b64caf02 100644 --- a/docs/examples/docker_hooks_examples.py +++ b/docs/examples/docker_hooks_examples.py @@ -1,235 +1,451 @@ #!/usr/bin/env python3 """ -Comprehensive test demonstrating all hook types from hooks_example.py -adapted for the Docker API with real URLs +🚀 Crawl4AI Docker Hooks System - Complete Examples +==================================================== + +This file demonstrates the Docker Hooks System with three different approaches: + +1. String-based hooks for REST API +2. hooks_to_string() utility to convert functions +3. Docker Client with automatic conversion (most convenient) + +Requirements: +- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest +- crawl4ai installed: pip install crawl4ai """ +import asyncio import requests import json import time from typing import Dict, Any -# API_BASE_URL = "http://localhost:11234" -API_BASE_URL = "http://localhost:11235" +# Import Crawl4AI components +from crawl4ai import hooks_to_string +from crawl4ai.docker_client import Crawl4aiDockerClient + +# Configuration +DOCKER_URL = "http://localhost:11235" +TEST_URLS = [ + "https://www.kidocode.com", + "https://quotes.toscrape.com", + "https://httpbin.org/html", +] -def test_all_hooks_demo(): - """Demonstrate all 8 hook types with practical examples""" - print("=" * 70) - print("Testing: All Hooks Comprehensive Demo") - print("=" * 70) - - hooks_code = { - "on_browser_created": """ -async def hook(browser, **kwargs): - # Hook called after browser is created - print("[HOOK] on_browser_created - Browser is ready!") - # Browser-level configurations would go here - return browser -""", - - "on_page_context_created": """ -async def hook(page, context, **kwargs): - # Hook called after a new page and context are created - print("[HOOK] on_page_context_created - New page created!") - - # Set viewport size for consistent rendering - await page.set_viewport_size({"width": 1920, "height": 1080}) - - # Add cookies for the session (using httpbin.org domain) - await context.add_cookies([ - { - "name": "test_session", - "value": "abc123xyz", - "domain": ".httpbin.org", - "path": "/", - "httpOnly": True, - "secure": True - } - ]) - - # Block ads and tracking scripts to speed up crawling - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda route: route.abort()) +def print_section(title: str, description: str = ""): + """Print a formatted section header""" + print("\n" + "=" * 70) + 
print(f" {title}") + if description: + print(f" {description}") + print("=" * 70 + "\n") + + +def check_docker_service() -> bool: + """Check if Docker service is running""" + try: + response = requests.get(f"{DOCKER_URL}/health", timeout=3) + return response.status_code == 200 + except: + return False + + +# ============================================================================ +# REUSABLE HOOK LIBRARY +# ============================================================================ + +async def performance_optimization_hook(page, context, **kwargs): + """ + Performance Hook: Block unnecessary resources to speed up crawling + """ + print(" [Hook] 🚀 Optimizing performance - blocking images and ads...") + + # Block images + await context.route( + "**/*.{png,jpg,jpeg,gif,webp,svg,ico}", + lambda route: route.abort() + ) + + # Block ads and analytics await context.route("**/analytics/*", lambda route: route.abort()) await context.route("**/ads/*", lambda route: route.abort()) - - print("[HOOK] Viewport set, cookies added, and ads blocked") + await context.route("**/google-analytics.com/*", lambda route: route.abort()) + + print(" [Hook] ✓ Performance optimization applied") return page -""", - - "on_user_agent_updated": """ -async def hook(page, context, user_agent, **kwargs): - # Hook called when user agent is updated - print(f"[HOOK] on_user_agent_updated - User agent: {user_agent[:50]}...") + + +async def viewport_setup_hook(page, context, **kwargs): + """ + Viewport Hook: Set consistent viewport size for rendering + """ + print(" [Hook] 🖥️ Setting viewport to 1920x1080...") + await page.set_viewport_size({"width": 1920, "height": 1080}) + print(" [Hook] ✓ Viewport configured") return page -""", - - "before_goto": """ -async def hook(page, context, url, **kwargs): - # Hook called before navigating to each URL - print(f"[HOOK] before_goto - About to visit: {url}") - - # Add custom headers for the request + + +async def authentication_headers_hook(page, context, url, **kwargs): + """ + Headers Hook: Add custom authentication and tracking headers + """ + print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...") + await page.set_extra_http_headers({ - "X-Custom-Header": "crawl4ai-test", - "Accept-Language": "en-US,en;q=0.9", - "DNT": "1" + 'X-Crawl4AI': 'docker-hooks', + 'X-Custom-Hook': 'function-based', + 'Accept-Language': 'en-US,en;q=0.9', }) - + + print(" [Hook] ✓ Custom headers added") return page -""", - - "after_goto": """ -async def hook(page, context, url, response, **kwargs): - # Hook called after navigating to each URL - print(f"[HOOK] after_goto - Successfully loaded: {url}") - - # Wait a moment for dynamic content to load + + +async def lazy_loading_handler_hook(page, context, **kwargs): + """ + Content Hook: Handle lazy-loaded content by scrolling + """ + print(" [Hook] 📜 Scrolling to load lazy content...") + + # Scroll to bottom + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.wait_for_timeout(1000) - - # Check if specific elements exist (with error handling) - try: - # For httpbin.org, wait for body element - await page.wait_for_selector("body", timeout=2000) - print("[HOOK] Body element found and loaded") - except: - print("[HOOK] Timeout waiting for body, continuing anyway") - - return page -""", - - "on_execution_started": """ -async def hook(page, context, **kwargs): - # Hook called after custom JavaScript execution - print("[HOOK] on_execution_started - Custom JS executed!") - - # You could inject additional JavaScript here if needed 
- await page.evaluate("console.log('[INJECTED] Hook JS running');") - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - # Hook called before retrieving the HTML content - print("[HOOK] before_retrieve_html - Preparing to get HTML") - - # Scroll to bottom to trigger lazy loading - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + + # Scroll to middle + await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2)") await page.wait_for_timeout(500) - + # Scroll back to top - await page.evaluate("window.scrollTo(0, 0);") + await page.evaluate("window.scrollTo(0, 0)") await page.wait_for_timeout(500) - - # One more scroll to middle for good measure - await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2);") - - print("[HOOK] Scrolling completed for lazy-loaded content") + + print(" [Hook] ✓ Lazy content loaded") return page -""", - - "before_return_html": """ -async def hook(page, context, html, **kwargs): - # Hook called before returning the HTML content - print(f"[HOOK] before_return_html - HTML length: {len(html)} characters") - - # Log some page metrics - metrics = await page.evaluate('''() => { - return { + + +async def page_analytics_hook(page, context, **kwargs): + """ + Analytics Hook: Log page metrics before extraction + """ + print(" [Hook] 📊 Collecting page analytics...") + + metrics = await page.evaluate(''' + () => ({ + title: document.title, images: document.images.length, links: document.links.length, - scripts: document.scripts.length - } - }''') - - print(f"[HOOK] Page metrics - Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}") - + scripts: document.scripts.length, + headings: document.querySelectorAll('h1, h2, h3').length, + paragraphs: document.querySelectorAll('p').length + }) + ''') + + print(f" [Hook] 📈 Page: {metrics['title'][:50]}...") + print(f" Links: {metrics['links']}, Images: {metrics['images']}, " + f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}") + + return page + + +# ============================================================================ +# APPROACH 1: String-Based Hooks (REST API) +# ============================================================================ + +def example_1_string_based_hooks(): + """ + Demonstrate string-based hooks with REST API + Use this when working with REST API directly or non-Python clients + """ + print_section( + "APPROACH 1: String-Based Hooks (REST API)", + "Define hooks as strings for REST API requests" + ) + + # Define hooks as strings + hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Setting up page context...") + # Block images for performance + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + await page.set_viewport_size({"width": 1920, "height": 1080}) + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f" [String Hook] Navigating to {url[:50]}...") + await page.set_extra_http_headers({ + 'X-Crawl4AI': 'string-based-hooks', + }) + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print(" [String Hook] Scrolling page...") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) return page """ } - - # Create request payload + + # Prepare request payload payload = { - "urls": ["https://httpbin.org/html"], + "urls": 
[TEST_URLS[2]], # httpbin.org "hooks": { - "code": hooks_code, + "code": hooks_config, "timeout": 30 }, "crawler_config": { - "js_code": "window.scrollTo(0, document.body.scrollHeight);", - "wait_for": "body", "cache_mode": "bypass" } } - - print("\nSending request with all 8 hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("\n✅ Request successful!") - - # Check hooks execution - if 'hooks' in data: - hooks_info = data['hooks'] - print("\n📊 Hooks Execution Summary:") - print(f" Status: {hooks_info['status']['status']}") - print(f" Attached hooks: {len(hooks_info['status']['attached_hooks'])}") - - for hook_name in hooks_info['status']['attached_hooks']: - print(f" ✓ {hook_name}") - - if 'summary' in hooks_info: - summary = hooks_info['summary'] - print(f"\n📈 Execution Statistics:") - print(f" Total executions: {summary['total_executions']}") - print(f" Successful: {summary['successful']}") - print(f" Failed: {summary['failed']}") - print(f" Timed out: {summary['timed_out']}") - print(f" Success rate: {summary['success_rate']:.1f}%") - - if hooks_info.get('execution_log'): - print(f"\n📝 Execution Log:") - for log_entry in hooks_info['execution_log']: - status_icon = "✅" if log_entry['status'] == 'success' else "❌" - exec_time = log_entry.get('execution_time', 0) - print(f" {status_icon} {log_entry['hook_point']}: {exec_time:.3f}s") - - # Check crawl results - if 'results' in data and len(data['results']) > 0: - print(f"\n📄 Crawl Results:") - for result in data['results']: - print(f" URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - if result.get('html'): - print(f" HTML length: {len(result['html'])} characters") - - else: - print(f"❌ Error: {response.status_code}") - try: - error_data = response.json() - print(f"Error details: {json.dumps(error_data, indent=2)}") - except: - print(f"Error text: {response.text[:500]}") + + print(f"🎯 Target URL: {TEST_URLS[2]}") + print(f"🔧 Configured {len(hooks_config)} string-based hooks") + print(f"📡 Sending request to Docker API...\n") + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + + print(f"\n✅ Request successful! 
(took {execution_time:.2f}s)") + + # Display results + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + html_length = len(crawl_result.get('html', '')) + markdown_length = len(crawl_result.get('markdown', '')) + + print(f"\n📊 Results:") + print(f" • HTML length: {html_length:,} characters") + print(f" • Markdown length: {markdown_length:,} characters") + print(f" • URL: {crawl_result.get('url')}") + + # Check hooks execution + if 'hooks' in result: + hooks_info = result['hooks'] + print(f"\n🎣 Hooks Execution:") + print(f" • Status: {hooks_info['status']['status']}") + print(f" • Attached hooks: {len(hooks_info['status']['attached_hooks'])}") + + if 'summary' in hooks_info: + summary = hooks_info['summary'] + print(f" • Total executions: {summary['total_executions']}") + print(f" • Successful: {summary['successful']}") + print(f" • Success rate: {summary['success_rate']:.1f}%") + else: + print(f"⚠️ Crawl completed but no results") + + else: + print(f"❌ Request failed with status {response.status_code}") + print(f" Error: {response.text[:200]}") + + except requests.exceptions.Timeout: + print("⏰ Request timed out after 60 seconds") + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\n" + "─" * 70) + print("✓ String-based hooks example complete\n") -def test_authentication_flow(): - """Test a complete authentication flow with multiple hooks""" - print("\n" + "=" * 70) - print("Testing: Authentication Flow with Multiple Hooks") - print("=" * 70) - +# ============================================================================ +# APPROACH 2: Function-Based Hooks with hooks_to_string() Utility +# ============================================================================ + +def example_2_hooks_to_string_utility(): + """ + Demonstrate the hooks_to_string() utility for converting functions + Use this when you want to write hooks as functions but use REST API + """ + print_section( + "APPROACH 2: hooks_to_string() Utility", + "Convert Python functions to strings for REST API" + ) + + print("📦 Creating hook functions...") + print(" • performance_optimization_hook") + print(" • authentication_headers_hook") + print(" • lazy_loading_handler_hook") + + # Convert function objects to strings using the utility + print("\n🔄 Converting functions to strings with hooks_to_string()...") + + hooks_dict = { + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + } + + hooks_as_strings = hooks_to_string(hooks_dict) + + print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings") + + # Show a preview + print("\n📝 Sample converted hook (first 200 characters):") + print("─" * 70) + sample_hook = list(hooks_as_strings.values())[0] + print(sample_hook[:200] + "...") + print("─" * 70) + + # Use the converted hooks with REST API + print("\n📡 Using converted hooks with REST API...") + + payload = { + "urls": [TEST_URLS[2]], + "hooks": { + "code": hooks_as_strings, + "timeout": 30 + } + } + + try: + start_time = time.time() + response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60) + execution_time = time.time() - start_time + + if response.status_code == 200: + result = response.json() + print(f"\n✅ Request successful! 
(took {execution_time:.2f}s)") + + if result.get('results') and result['results'][0].get('success'): + crawl_result = result['results'][0] + print(f" • HTML length: {len(crawl_result.get('html', '')):,} characters") + print(f" • Hooks executed successfully!") + else: + print(f"❌ Request failed: {response.status_code}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + + print("\n💡 Benefits of hooks_to_string():") + print(" ✓ Write hooks as regular Python functions") + print(" ✓ Full IDE support (autocomplete, syntax highlighting)") + print(" ✓ Type checking and linting") + print(" ✓ Easy to test and debug") + print(" ✓ Reusable across projects") + print(" ✓ Works with any REST API client") + + print("\n" + "─" * 70) + print("✓ hooks_to_string() utility example complete\n") + + +# ============================================================================ +# APPROACH 3: Docker Client with Automatic Conversion (RECOMMENDED) +# ============================================================================ + +async def example_3_docker_client_auto_conversion(): + """ + Demonstrate Docker Client with automatic hook conversion (RECOMMENDED) + Use this for the best developer experience with Python + """ + print_section( + "APPROACH 3: Docker Client with Auto-Conversion (RECOMMENDED)", + "Pass function objects directly - conversion happens automatically!" + ) + + print("🐳 Initializing Crawl4AI Docker Client...") + client = Crawl4aiDockerClient(base_url=DOCKER_URL) + + print("✅ Client ready!\n") + + # Use our reusable hook library - just pass the function objects! + print("📚 Using reusable hook library:") + print(" • performance_optimization_hook") + print(" • authentication_headers_hook") + print(" • lazy_loading_handler_hook") + print(" • page_analytics_hook") + + print("\n🎯 Target URL: " + TEST_URLS[0]) + print("🚀 Starting crawl with automatic hook conversion...\n") + + try: + start_time = time.time() + + # Pass function objects directly - NO manual conversion needed! ✨ + results = await client.crawl( + urls=[TEST_URLS[0]], + hooks={ + "on_page_context_created": performance_optimization_hook, + "before_goto": authentication_headers_hook, + "before_retrieve_html": lazy_loading_handler_hook, + "before_return_html": page_analytics_hook, + }, + hooks_timeout=30 + ) + + execution_time = time.time() - start_time + + print(f"\n✅ Crawl completed! 
(took {execution_time:.2f}s)\n") + + # Display results + if results and results.success: + result = results + print(f"📊 Results:") + print(f" • URL: {result.url}") + print(f" • Success: {result.success}") + print(f" • HTML length: {len(result.html):,} characters") + print(f" • Markdown length: {len(result.markdown):,} characters") + + # Show metadata + if result.metadata: + print(f"\n📋 Metadata:") + print(f" • Title: {result.metadata.get('title', 'N/A')[:50]}...") + + # Show links + if result.links: + internal_count = len(result.links.get('internal', [])) + external_count = len(result.links.get('external', [])) + print(f"\n🔗 Links Found:") + print(f" • Internal: {internal_count}") + print(f" • External: {external_count}") + else: + print(f"⚠️ Crawl completed but no successful results") + if results: + print(f" Error: {results.error_message}") + + except Exception as e: + print(f"❌ Error: {str(e)}") + import traceback + traceback.print_exc() + + print("\n🌟 Why Docker Client is RECOMMENDED:") + print(" ✓ Automatic function-to-string conversion") + print(" ✓ No manual hooks_to_string() calls needed") + print(" ✓ Cleaner, more Pythonic code") + print(" ✓ Full type hints and IDE support") + print(" ✓ Built-in error handling") + print(" ✓ Async/await support") + + print("\n" + "─" * 70) + print("✓ Docker Client auto-conversion example complete\n") + + +# ============================================================================ +# APPROACH 4: Authentication Example +# ============================================================================ + +def example_4_authentication_flow(): + """ + Demonstrate authentication flow with multiple hooks + """ + print_section( + "EXAMPLE 4: Authentication Flow", + "Using hooks for authentication with cookies and headers" + ) + hooks_code = { "on_page_context_created": """ async def hook(page, context, **kwargs): print("[HOOK] Setting up authentication context") - + # Add authentication cookies await context.add_cookies([ { @@ -241,50 +457,42 @@ async def hook(page, context, **kwargs): "secure": True } ]) - - # Set localStorage items (for SPA authentication) - await page.evaluate(''' - localStorage.setItem('user_id', '12345'); - localStorage.setItem('auth_time', new Date().toISOString()); - ''') - + return page """, - + "before_goto": """ async def hook(page, context, url, **kwargs): print(f"[HOOK] Adding auth headers for {url}") - + # Add Authorization header import base64 credentials = base64.b64encode(b"user:passwd").decode('ascii') - + await page.set_extra_http_headers({ 'Authorization': f'Basic {credentials}', 'X-API-Key': 'test-api-key-123' }) - + return page """ } - + payload = { - "urls": [ - "https://httpbin.org/basic-auth/user/passwd" - ], + "urls": ["https://httpbin.org/basic-auth/user/passwd"], "hooks": { "code": hooks_code, "timeout": 15 } } - + print("\nTesting authentication with httpbin endpoints...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - + response = requests.post(f"{DOCKER_URL}/crawl", json=payload) + if response.status_code == 200: data = response.json() print("✅ Authentication test completed") - + if 'results' in data: for i, result in enumerate(data['results']): print(f"\n URL {i+1}: {result['url']}") @@ -300,214 +508,120 @@ async def hook(page, context, url, **kwargs): else: print(f"❌ Error: {response.status_code}") + print("\n" + "─" * 70) + print("✓ Authentication example complete\n") -def test_performance_optimization_hooks(): - """Test hooks for performance optimization""" + +# 
============================================================================ +# MAIN EXECUTION +# ============================================================================ + +async def main(): + """ + Run all example demonstrations + """ print("\n" + "=" * 70) - print("Testing: Performance Optimization Hooks") + print(" 🚀 Crawl4AI - Docker Hooks System Examples") print("=" * 70) - - hooks_code = { - "on_page_context_created": """ -async def hook(page, context, **kwargs): - print("[HOOK] Optimizing page for performance") - - # Block resource-heavy content - await context.route("**/*.{png,jpg,jpeg,gif,webp,svg,ico}", lambda route: route.abort()) - await context.route("**/*.{woff,woff2,ttf,otf}", lambda route: route.abort()) - await context.route("**/*.{mp4,webm,ogg,mp3,wav}", lambda route: route.abort()) - await context.route("**/googletagmanager.com/*", lambda route: route.abort()) - await context.route("**/google-analytics.com/*", lambda route: route.abort()) - await context.route("**/doubleclick.net/*", lambda route: route.abort()) - await context.route("**/facebook.com/*", lambda route: route.abort()) - - # Disable animations and transitions - await page.add_style_tag(content=''' - *, *::before, *::after { - animation-duration: 0s !important; - animation-delay: 0s !important; - transition-duration: 0s !important; - transition-delay: 0s !important; - } - ''') - - print("[HOOK] Performance optimizations applied") - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Removing unnecessary elements before extraction") - - # Remove ads, popups, and other unnecessary elements - await page.evaluate('''() => { - // Remove common ad containers - const adSelectors = [ - '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]', - '.popup', '.modal', '.overlay', '.cookie-banner', '.newsletter-signup' - ]; - - adSelectors.forEach(selector => { - document.querySelectorAll(selector).forEach(el => el.remove()); - }); - - // Remove script tags to clean up HTML - document.querySelectorAll('script').forEach(el => el.remove()); - - // Remove style tags we don't need - document.querySelectorAll('style').forEach(el => el.remove()); - }''') - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html"], - "hooks": { - "code": hooks_code, - "timeout": 10 - } - } - - print("\nTesting performance optimization hooks...") - start_time = time.time() - - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - elapsed_time = time.time() - start_time - print(f"Request completed in {elapsed_time:.2f} seconds") - - if response.status_code == 200: - data = response.json() - print("✅ Performance optimization test completed") - - if 'results' in data and len(data['results']) > 0: - result = data['results'][0] - if result.get('html'): - print(f" HTML size: {len(result['html'])} characters") - print(" Resources blocked, ads removed, animations disabled") - else: - print(f"❌ Error: {response.status_code}") + # Check Docker service + print("\n🔍 Checking Docker service status...") + if not check_docker_service(): + print("❌ Docker service is not running!") + print("\n📋 To start the Docker service:") + print(" docker run -p 11235:11235 unclecode/crawl4ai:latest") + print("\nPlease start the service and run this example again.") + return -def test_content_extraction_hooks(): - """Test hooks for intelligent content extraction""" - print("\n" + "=" * 70) - print("Testing: Content Extraction Hooks") - print("=" * 70) - - hooks_code = { - 
"after_goto": """ -async def hook(page, context, url, response, **kwargs): - print(f"[HOOK] Waiting for dynamic content on {url}") - - # Wait for any lazy-loaded content - await page.wait_for_timeout(2000) - - # Trigger any "Load More" buttons - try: - load_more = await page.query_selector('[class*="load-more"], [class*="show-more"], button:has-text("Load More")') - if load_more: - await load_more.click() - await page.wait_for_timeout(1000) - print("[HOOK] Clicked 'Load More' button") - except: - pass - - return page -""", - - "before_retrieve_html": """ -async def hook(page, context, **kwargs): - print("[HOOK] Extracting structured data") - - # Extract metadata - metadata = await page.evaluate('''() => { - const getMeta = (name) => { - const element = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`); - return element ? element.getAttribute('content') : null; - }; - - return { - title: document.title, - description: getMeta('description') || getMeta('og:description'), - author: getMeta('author'), - keywords: getMeta('keywords'), - ogTitle: getMeta('og:title'), - ogImage: getMeta('og:image'), - canonical: document.querySelector('link[rel="canonical"]')?.href, - jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]')) - .map(el => el.textContent).filter(Boolean) - }; - }''') - - print(f"[HOOK] Extracted metadata: {json.dumps(metadata, indent=2)}") - - # Infinite scroll handling - for i in range(3): - await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") - await page.wait_for_timeout(1000) - print(f"[HOOK] Scroll iteration {i+1}/3") - - return page -""" - } - - payload = { - "urls": ["https://httpbin.org/html", "https://httpbin.org/json"], - "hooks": { - "code": hooks_code, - "timeout": 20 - } - } - - print("\nTesting content extraction hooks...") - response = requests.post(f"{API_BASE_URL}/crawl", json=payload) - - if response.status_code == 200: - data = response.json() - print("✅ Content extraction test completed") - - if 'hooks' in data and 'summary' in data['hooks']: - summary = data['hooks']['summary'] - print(f" Hooks executed: {summary['successful']}/{summary['total_executions']}") - - if 'results' in data: - for result in data['results']: - print(f"\n URL: {result['url']}") - print(f" Success: {result.get('success', False)}") - else: - print(f"❌ Error: {response.status_code}") + print("✅ Docker service is running!\n") - -def main(): - """Run comprehensive hook tests""" - print("🔧 Crawl4AI Docker API - Comprehensive Hooks Testing") - print("Based on docs/examples/hooks_example.py") - print("=" * 70) - - tests = [ - ("All Hooks Demo", test_all_hooks_demo), - ("Authentication Flow", test_authentication_flow), - ("Performance Optimization", test_performance_optimization_hooks), - ("Content Extraction", test_content_extraction_hooks), + # Run all examples + examples = [ + ("String-Based Hooks (REST API)", example_1_string_based_hooks, False), + ("hooks_to_string() Utility", example_2_hooks_to_string_utility, False), + ("Docker Client Auto-Conversion (Recommended)", example_3_docker_client_auto_conversion, True), + ("Authentication Flow", example_4_authentication_flow, False), ] - - for i, (name, test_func) in enumerate(tests, 1): - print(f"\n📌 Test {i}/{len(tests)}: {name}") + + for i, (name, example_func, is_async) in enumerate(examples, 1): + print(f"\n{'🔷' * 35}") + print(f"Example {i}/{len(examples)}: {name}") + print(f"{'🔷' * 35}\n") + try: - test_func() - print(f"✅ {name} completed") + if is_async: + await 
example_func() + else: + example_func() + + print(f"✅ Example {i} completed successfully!") + + # Pause between examples (except the last one) + if i < len(examples): + print("\n⏸️ Press Enter to continue to next example...") + input() + + except KeyboardInterrupt: + print(f"\n⏹️ Examples interrupted by user") + break except Exception as e: - print(f"❌ {name} failed: {e}") + print(f"\n❌ Example {i} failed: {str(e)}") import traceback traceback.print_exc() - + print("\nContinuing to next example...\n") + continue + + # Final summary print("\n" + "=" * 70) - print("🎉 All comprehensive hook tests completed!") + print(" 🎉 All Examples Complete!") print("=" * 70) + print("\n📊 Summary - Three Approaches to Docker Hooks:") + + print("\n✨ 1. String-Based Hooks:") + print(" • Write hooks as strings directly in JSON") + print(" • Best for: REST API, non-Python clients, simple use cases") + print(" • Cons: No IDE support, harder to debug") + + print("\n✨ 2. hooks_to_string() Utility:") + print(" • Write hooks as Python functions, convert to strings") + print(" • Best for: Python with REST API, reusable hook libraries") + print(" • Pros: IDE support, type checking, easy debugging") + + print("\n✨ 3. Docker Client (RECOMMENDED):") + print(" • Pass function objects directly, automatic conversion") + print(" • Best for: Python applications, best developer experience") + print(" • Pros: All benefits of #2 + cleaner code, no manual conversion") + + print("\n💡 Recommendation:") + print(" Use Docker Client (#3) for Python applications") + print(" Use hooks_to_string() (#2) when you need REST API flexibility") + print(" Use string-based (#1) for non-Python clients or simple scripts") + + print("\n🎯 8 Hook Points Available:") + print(" • on_browser_created, on_page_context_created") + print(" • on_user_agent_updated, before_goto, after_goto") + print(" • on_execution_started, before_retrieve_html, before_return_html") + + print("\n📚 Resources:") + print(" • Docs: https://docs.crawl4ai.com/core/docker-deployment") + print(" • GitHub: https://github.com/unclecode/crawl4ai") + print(" • Discord: https://discord.gg/jP8KfhDhyN") + + print("\n" + "=" * 70) + print(" Happy Crawling! 🕷️") + print("=" * 70 + "\n") + if __name__ == "__main__": - main() \ No newline at end of file + print("\n🎬 Starting Crawl4AI Docker Hooks Examples...") + print("Press Ctrl+C anytime to exit\n") + + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\n👋 Examples stopped by user. Thanks for exploring Crawl4AI!") + except Exception as e: + print(f"\n\n❌ Error: {str(e)}") + import traceback + traceback.print_exc() diff --git a/docs/examples/docker_webhook_example.py b/docs/examples/docker_webhook_example.py new file mode 100644 index 00000000..f05d3501 --- /dev/null +++ b/docs/examples/docker_webhook_example.py @@ -0,0 +1,461 @@ +""" +Docker Webhook Example for Crawl4AI + +This example demonstrates how to use webhooks with the Crawl4AI job queue API. +Instead of polling for results, webhooks notify your application when jobs complete. + +Supports both: +- /crawl/job - Raw crawling with markdown extraction +- /llm/job - LLM-powered content extraction + +Prerequisites: +1. Crawl4AI Docker container running on localhost:11235 +2. Flask installed: pip install flask requests +3. LLM API key configured in .llm.env (for LLM extraction examples) + +Usage: +1. Run this script: python docker_webhook_example.py +2. The webhook server will start on http://localhost:8080 +3. 
Jobs will be submitted and webhooks will be received automatically +""" + +import requests +import json +import time +from flask import Flask, request, jsonify +from threading import Thread + +# Configuration +CRAWL4AI_BASE_URL = "http://localhost:11235" +WEBHOOK_BASE_URL = "http://localhost:8080" # Your webhook receiver URL + +# Initialize Flask app for webhook receiver +app = Flask(__name__) + +# Store received webhook data for demonstration +received_webhooks = [] + + +@app.route('/webhooks/crawl-complete', methods=['POST']) +def handle_crawl_webhook(): + """ + Webhook handler that receives notifications when crawl jobs complete. + + Payload structure: + { + "task_id": "crawl_abc123", + "task_type": "crawl", + "status": "completed" or "failed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com"], + "error": "error message" (only if failed), + "data": {...} (only if webhook_data_in_payload=True) + } + """ + payload = request.json + print(f"\n{'='*60}") + print(f"📬 Webhook received for task: {payload['task_id']}") + print(f" Status: {payload['status']}") + print(f" Timestamp: {payload['timestamp']}") + print(f" URLs: {payload['urls']}") + + if payload['status'] == 'completed': + # If data is in payload, process it directly + if 'data' in payload: + print(f" ✅ Data included in webhook") + data = payload['data'] + # Process the crawl results here + for result in data.get('results', []): + print(f" - Crawled: {result.get('url')}") + print(f" - Markdown length: {len(result.get('markdown', ''))}") + else: + # Fetch results from API if not included + print(f" 📥 Fetching results from API...") + task_id = payload['task_id'] + result_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}") + if result_response.ok: + data = result_response.json() + print(f" ✅ Results fetched successfully") + # Process the crawl results here + for result in data['result'].get('results', []): + print(f" - Crawled: {result.get('url')}") + print(f" - Markdown length: {len(result.get('markdown', ''))}") + + elif payload['status'] == 'failed': + print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}") + + print(f"{'='*60}\n") + + # Store webhook for demonstration + received_webhooks.append(payload) + + # Return 200 OK to acknowledge receipt + return jsonify({"status": "received"}), 200 + + +@app.route('/webhooks/llm-complete', methods=['POST']) +def handle_llm_webhook(): + """ + Webhook handler that receives notifications when LLM extraction jobs complete. 
+ + Payload structure: + { + "task_id": "llm_1698765432_12345", + "task_type": "llm_extraction", + "status": "completed" or "failed", + "timestamp": "2025-10-21T10:30:00.000000+00:00", + "urls": ["https://example.com/article"], + "error": "error message" (only if failed), + "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True) + } + """ + payload = request.json + print(f"\n{'='*60}") + print(f"🤖 LLM Webhook received for task: {payload['task_id']}") + print(f" Task Type: {payload['task_type']}") + print(f" Status: {payload['status']}") + print(f" Timestamp: {payload['timestamp']}") + print(f" URL: {payload['urls'][0]}") + + if payload['status'] == 'completed': + # If data is in payload, process it directly + if 'data' in payload: + print(f" ✅ Data included in webhook") + data = payload['data'] + # Webhook wraps extracted content in 'extracted_content' field + extracted = data.get('extracted_content', {}) + print(f" - Extracted content:") + print(f" {json.dumps(extracted, indent=8)}") + else: + # Fetch results from API if not included + print(f" 📥 Fetching results from API...") + task_id = payload['task_id'] + result_response = requests.get(f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}") + if result_response.ok: + data = result_response.json() + print(f" ✅ Results fetched successfully") + # API returns unwrapped content in 'result' field + extracted = data['result'] + print(f" - Extracted content:") + print(f" {json.dumps(extracted, indent=8)}") + + elif payload['status'] == 'failed': + print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}") + + print(f"{'='*60}\n") + + # Store webhook for demonstration + received_webhooks.append(payload) + + # Return 200 OK to acknowledge receipt + return jsonify({"status": "received"}), 200 + + +def start_webhook_server(): + """Start the Flask webhook server in a separate thread""" + app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False) + + +def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False): + """ + Submit a crawl job with webhook notification. + + Args: + urls: List of URLs to crawl + webhook_url: URL to receive webhook notifications + include_data: Whether to include full results in webhook payload + + Returns: + task_id: The job's task identifier + """ + payload = { + "urls": urls, + "browser_config": {"headless": True}, + "crawler_config": {"cache_mode": "bypass"}, + "webhook_config": { + "webhook_url": webhook_url, + "webhook_data_in_payload": include_data, + # Optional: Add custom headers for authentication + # "webhook_headers": { + # "X-Webhook-Secret": "your-secret-token" + # } + } + } + + print(f"\n🚀 Submitting crawl job...") + print(f" URLs: {urls}") + print(f" Webhook: {webhook_url}") + print(f" Include data: {include_data}") + + response = requests.post( + f"{CRAWL4AI_BASE_URL}/crawl/job", + json=payload, + headers={"Content-Type": "application/json"} + ) + + if response.ok: + data = response.json() + task_id = data['task_id'] + print(f" ✅ Job submitted successfully") + print(f" Task ID: {task_id}") + return task_id + else: + print(f" ❌ Failed to submit job: {response.text}") + return None + + +def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None): + """ + Submit an LLM extraction job with webhook notification. 
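+
+    Example (illustrative values only):
+        submit_llm_job_with_webhook(
+            url="https://example.com",
+            query="Extract the page title",
+            webhook_url="http://localhost:8080/webhooks/llm-complete",
+        )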
+ + Args: + url: URL to extract content from + query: Instruction for the LLM (e.g., "Extract article title and author") + webhook_url: URL to receive webhook notifications + include_data: Whether to include full results in webhook payload + schema: Optional JSON schema for structured extraction + provider: Optional LLM provider (e.g., "openai/gpt-4o-mini") + + Returns: + task_id: The job's task identifier + """ + payload = { + "url": url, + "q": query, + "cache": False, + "webhook_config": { + "webhook_url": webhook_url, + "webhook_data_in_payload": include_data, + # Optional: Add custom headers for authentication + # "webhook_headers": { + # "X-Webhook-Secret": "your-secret-token" + # } + } + } + + if schema: + payload["schema"] = schema + + if provider: + payload["provider"] = provider + + print(f"\n🤖 Submitting LLM extraction job...") + print(f" URL: {url}") + print(f" Query: {query}") + print(f" Webhook: {webhook_url}") + print(f" Include data: {include_data}") + if provider: + print(f" Provider: {provider}") + + response = requests.post( + f"{CRAWL4AI_BASE_URL}/llm/job", + json=payload, + headers={"Content-Type": "application/json"} + ) + + if response.ok: + data = response.json() + task_id = data['task_id'] + print(f" ✅ Job submitted successfully") + print(f" Task ID: {task_id}") + return task_id + else: + print(f" ❌ Failed to submit job: {response.text}") + return None + + +def submit_job_without_webhook(urls): + """ + Submit a job without webhook (traditional polling approach). + + Args: + urls: List of URLs to crawl + + Returns: + task_id: The job's task identifier + """ + payload = { + "urls": urls, + "browser_config": {"headless": True}, + "crawler_config": {"cache_mode": "bypass"} + } + + print(f"\n🚀 Submitting crawl job (without webhook)...") + print(f" URLs: {urls}") + + response = requests.post( + f"{CRAWL4AI_BASE_URL}/crawl/job", + json=payload + ) + + if response.ok: + data = response.json() + task_id = data['task_id'] + print(f" ✅ Job submitted successfully") + print(f" Task ID: {task_id}") + return task_id + else: + print(f" ❌ Failed to submit job: {response.text}") + return None + + +def poll_job_status(task_id, timeout=60): + """ + Poll for job status (used when webhook is not configured). 
+ + Args: + task_id: The job's task identifier + timeout: Maximum time to wait in seconds + """ + print(f"\n⏳ Polling for job status...") + start_time = time.time() + + while time.time() - start_time < timeout: + response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}") + + if response.ok: + data = response.json() + status = data.get('status', 'unknown') + + if status == 'completed': + print(f" ✅ Job completed!") + return data + elif status == 'failed': + print(f" ❌ Job failed: {data.get('error', 'Unknown error')}") + return data + else: + print(f" ⏳ Status: {status}, waiting...") + time.sleep(2) + else: + print(f" ❌ Failed to get status: {response.text}") + return None + + print(f" ⏰ Timeout reached") + return None + + +def main(): + """Run the webhook demonstration""" + + # Check if Crawl4AI is running + try: + health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5) + print(f"✅ Crawl4AI is running: {health.json()}") + except: + print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}") + print(" Please make sure Docker container is running:") + print(" docker run -d -p 11235:11235 --name crawl4ai unclecode/crawl4ai:latest") + return + + # Start webhook server in background thread + print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...") + webhook_thread = Thread(target=start_webhook_server, daemon=True) + webhook_thread.start() + time.sleep(2) # Give server time to start + + # Example 1: Job with webhook (notification only, fetch data separately) + print(f"\n{'='*60}") + print("Example 1: Webhook Notification Only") + print(f"{'='*60}") + task_id_1 = submit_crawl_job_with_webhook( + urls=["https://example.com"], + webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete", + include_data=False + ) + + # Example 2: Job with webhook (data included in payload) + time.sleep(5) # Wait a bit between requests + print(f"\n{'='*60}") + print("Example 2: Webhook with Full Data") + print(f"{'='*60}") + task_id_2 = submit_crawl_job_with_webhook( + urls=["https://www.python.org"], + webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete", + include_data=True + ) + + # Example 3: LLM extraction with webhook (notification only) + time.sleep(5) # Wait a bit between requests + print(f"\n{'='*60}") + print("Example 3: LLM Extraction with Webhook (Notification Only)") + print(f"{'='*60}") + task_id_3 = submit_llm_job_with_webhook( + url="https://www.example.com", + query="Extract the main heading and description from this page.", + webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete", + include_data=False, + provider="openai/gpt-4o-mini" + ) + + # Example 4: LLM extraction with webhook (data included + schema) + time.sleep(5) # Wait a bit between requests + print(f"\n{'='*60}") + print("Example 4: LLM Extraction with Schema and Full Data") + print(f"{'='*60}") + + # Define a schema for structured extraction + schema = json.dumps({ + "type": "object", + "properties": { + "title": {"type": "string", "description": "Page title"}, + "description": {"type": "string", "description": "Page description"} + }, + "required": ["title"] + }) + + task_id_4 = submit_llm_job_with_webhook( + url="https://www.python.org", + query="Extract the title and description of this website", + webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete", + include_data=True, + schema=schema, + provider="openai/gpt-4o-mini" + ) + + # Example 5: Traditional polling (no webhook) + time.sleep(5) # Wait a bit between requests + print(f"\n{'='*60}") + print("Example 5: Traditional Polling (No Webhook)") + 
print(f"{'='*60}") + task_id_5 = submit_job_without_webhook( + urls=["https://github.com"] + ) + if task_id_5: + result = poll_job_status(task_id_5) + if result and result.get('status') == 'completed': + print(f" ✅ Results retrieved via polling") + + # Wait for webhooks to arrive + print(f"\n⏳ Waiting for webhooks to be received...") + time.sleep(30) # Give jobs time to complete and webhooks to arrive (longer for LLM) + + # Summary + print(f"\n{'='*60}") + print("Summary") + print(f"{'='*60}") + print(f"Total webhooks received: {len(received_webhooks)}") + + crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl'] + llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction'] + + print(f"\n📊 Breakdown:") + print(f" - Crawl webhooks: {len(crawl_webhooks)}") + print(f" - LLM extraction webhooks: {len(llm_webhooks)}") + + print(f"\n📋 Details:") + for i, webhook in enumerate(received_webhooks, 1): + task_type = webhook['task_type'] + icon = "🕷️" if task_type == "crawl" else "🤖" + print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})") + + print(f"\n✅ Demo completed!") + print(f"\n💡 Pro tips:") + print(f" - In production, your webhook URL should be publicly accessible") + print(f" (e.g., https://myapp.com/webhooks) or use ngrok for testing") + print(f" - Both /crawl/job and /llm/job support the same webhook configuration") + print(f" - Use webhook_data_in_payload=true to get results directly in the webhook") + print(f" - LLM jobs may take longer, adjust timeouts accordingly") + + +if __name__ == "__main__": + main() diff --git a/docs/md_v2/assets/crawl4ai-skill.zip b/docs/md_v2/assets/crawl4ai-skill.zip new file mode 100644 index 00000000..21785b02 Binary files /dev/null and b/docs/md_v2/assets/crawl4ai-skill.zip differ diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md index 6eb6112b..c955572e 100644 --- a/docs/md_v2/blog/index.md +++ b/docs/md_v2/blog/index.md @@ -20,17 +20,43 @@ Ever wondered why your AI coding assistant struggles with your library despite c ## Latest Release +### [Crawl4AI v0.7.6 – The Webhook Infrastructure Update](../blog/release-v0.7.6.md) +*October 22, 2025* + +Crawl4AI v0.7.6 introduces comprehensive webhook support for the Docker job queue API, bringing real-time notifications to both crawling and LLM extraction workflows. No more polling! + +Key highlights: +- **🪝 Complete Webhook Support**: Real-time notifications for both `/crawl/job` and `/llm/job` endpoints +- **🔄 Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s) +- **🔐 Custom Authentication**: Add custom headers for webhook authentication +- **📊 Flexible Delivery**: Choose notification-only or include full data in payload +- **⚙️ Global Configuration**: Set default webhook URL in config.yml for all jobs +- **🎯 Zero Breaking Changes**: Fully backward compatible, webhooks are opt-in + +[Read full release notes →](../blog/release-v0.7.6.md) + +## Recent Releases + +### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md) +*September 29, 2025* + +Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues. 
+
+Key highlights:
+- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization
+- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
+- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications
+- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
+- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
+
+[Read full release notes →](../blog/release-v0.7.5.md)
+
 ### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
 *August 17, 2025*
 
-Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads.
-
-Key highlights:
-- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
-- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks
-- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
-- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
-- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
+Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes.
 
 [Read full release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
diff --git a/docs/md_v2/blog/releases/0.7.6.md b/docs/md_v2/blog/releases/0.7.6.md
new file mode 100644
index 00000000..e27d19cc
--- /dev/null
+++ b/docs/md_v2/blog/releases/0.7.6.md
@@ -0,0 +1,314 @@
+# Crawl4AI v0.7.6 Release Notes
+
+*Release Date: October 22, 2025*
+
+I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
+
+## 🎯 What's New
+
+### Webhook Support for Docker Job Queue API
+
+The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete! 
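+The delivery layer retries failed webhooks with a doubling backoff. As a quick sketch of how that schedule unfolds (the parameter names mirror the `config.yml` defaults shown later; the computation itself is illustrative, not the server's internal code):
+
+```python
+# Doubling backoff capped at max_delay_ms: 1s -> 2s -> 4s -> 8s -> 16s
+def retry_delays(max_attempts=5, initial_delay_ms=1000, max_delay_ms=32000):
+    return [min(initial_delay_ms * 2 ** n, max_delay_ms) / 1000 for n in range(max_attempts)]
+
+print(retry_delays())  # [1.0, 2.0, 4.0, 8.0, 16.0]
+```
+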
+ +**Key Capabilities:** + +- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks +- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload +- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s) +- ✅ **Custom Authentication**: Add custom headers for webhook authentication +- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs +- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks + +### How It Works + +Instead of constantly checking job status: + +**OLD WAY (Polling):** +```python +# Submit job +response = requests.post("http://localhost:11235/crawl/job", json=payload) +task_id = response.json()['task_id'] + +# Poll until complete +while True: + status = requests.get(f"http://localhost:11235/crawl/job/{task_id}") + if status.json()['status'] == 'completed': + break + time.sleep(5) # Wait and try again +``` + +**NEW WAY (Webhooks):** +```python +# Submit job with webhook +payload = { + "urls": ["https://example.com"], + "webhook_config": { + "webhook_url": "https://myapp.com/webhook", + "webhook_data_in_payload": True + } +} +response = requests.post("http://localhost:11235/crawl/job", json=payload) + +# Done! Webhook will notify you when complete +# Your webhook handler receives the results automatically +``` + +### Crawl Job Webhooks + +```bash +curl -X POST http://localhost:11235/crawl/job \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "browser_config": {"headless": true}, + "crawler_config": {"cache_mode": "bypass"}, + "webhook_config": { + "webhook_url": "https://myapp.com/webhooks/crawl-complete", + "webhook_data_in_payload": false, + "webhook_headers": { + "X-Webhook-Secret": "your-secret-token" + } + } + }' +``` + +### LLM Extraction Job Webhooks (NEW!) 
+
+```bash
+curl -X POST http://localhost:11235/llm/job \
+  -H "Content-Type: application/json" \
+  -d '{
+    "url": "https://example.com/article",
+    "q": "Extract the article title, author, and publication date",
+    "schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
+    "provider": "openai/gpt-4o-mini",
+    "webhook_config": {
+      "webhook_url": "https://myapp.com/webhooks/llm-complete",
+      "webhook_data_in_payload": true
+    }
+  }'
+```
+
+### Webhook Payload Structure
+
+**Success (with data):**
+```json
+{
+  "task_id": "llm_1698765432",
+  "task_type": "llm_extraction",
+  "status": "completed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com/article"],
+  "data": {
+    "extracted_content": {
+      "title": "Understanding Web Scraping",
+      "author": "John Doe",
+      "date": "2025-10-22"
+    }
+  }
+}
+```
+
+**Failure:**
+```json
+{
+  "task_id": "crawl_abc123",
+  "task_type": "crawl",
+  "status": "failed",
+  "timestamp": "2025-10-22T10:30:00.000000+00:00",
+  "urls": ["https://example.com"],
+  "error": "Connection timeout after 30s"
+}
+```
+
+### Simple Webhook Handler Example
+
+```python
+import requests
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+@app.route('/webhook', methods=['POST'])
+def handle_webhook():
+    payload = request.json
+
+    task_id = payload['task_id']
+    task_type = payload['task_type']
+    status = payload['status']
+
+    if status == 'completed':
+        if 'data' in payload:
+            # Process data directly
+            data = payload['data']
+        else:
+            # Fetch from API
+            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
+            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
+            data = response.json()
+
+        # Your business logic here
+        print(f"Job {task_id} completed!")
+
+    elif status == 'failed':
+        error = payload.get('error', 'Unknown error')
+        print(f"Job {task_id} failed: {error}")
+
+    return jsonify({"status": "received"}), 200
+
+app.run(port=8080)
+```
+
+## 📊 Performance Improvements
+
+- **Reduced Server Load**: Eliminates constant polling requests
+- **Lower Latency**: Instant notification instead of waiting out a polling interval
+- **Better Resource Usage**: Frees up client connections while jobs run in the background
+- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
+
+## 🐛 Bug Fixes
+
+- Fixed webhook configuration serialization for Pydantic HttpUrl fields
+- Improved error handling in the webhook delivery service
+- Enhanced Redis task storage for webhook config persistence
+
+## 🌍 Expected Real-World Impact
+
+### For Web Scraping Workflows
+- **Reduced Costs**: Fewer API calls mean lower bandwidth and server costs
+- **Better UX**: Instant notifications improve user experience
+- **Scalability**: Handle hundreds of concurrent jobs without polling overhead
+
+### For LLM Extraction Pipelines
+- **Async Processing**: Submit LLM extraction jobs and move on
+- **Batch Processing**: Queue multiple extractions, get notified as they complete
+- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
+
+### For Microservices
+- **Event-Driven**: Perfect for event-driven microservice architectures
+- **Decoupling**: Decouple job submission from result processing
+- **Reliability**: Automatic retries make webhook delivery far more dependable
+
+## 🔄 Breaking Changes
+
+**None!** This release is fully backward compatible. 
+ +- Webhook configuration is optional +- Existing code continues to work without modification +- Polling is still supported for jobs without webhook config + +## 📚 Documentation + +### New Documentation +- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide +- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples + +### Updated Documentation +- **[Docker README](../deploy/docker/README.md)** - Added webhook sections +- API documentation with webhook examples + +## 🛠️ Migration Guide + +No migration needed! Webhooks are opt-in: + +1. **To use webhooks**: Add `webhook_config` to your job payload +2. **To keep polling**: Continue using your existing code + +### Quick Start + +```python +# Just add webhook_config to your existing payload +payload = { + # Your existing configuration + "urls": ["https://example.com"], + "browser_config": {...}, + "crawler_config": {...}, + + # NEW: Add webhook configuration + "webhook_config": { + "webhook_url": "https://myapp.com/webhook", + "webhook_data_in_payload": True + } +} +``` + +## 🔧 Configuration + +### Global Webhook Configuration (config.yml) + +```yaml +webhooks: + enabled: true + default_url: "https://myapp.com/webhooks/default" # Optional + data_in_payload: false + retry: + max_attempts: 5 + initial_delay_ms: 1000 + max_delay_ms: 32000 + timeout_ms: 30000 + headers: + User-Agent: "Crawl4AI-Webhook/1.0" +``` + +## 🚀 Upgrade Instructions + +### Docker + +```bash +# Pull the latest image +docker pull unclecode/crawl4ai:0.7.6 + +# Or use latest tag +docker pull unclecode/crawl4ai:latest + +# Run with webhook support +docker run -d \ + -p 11235:11235 \ + --env-file .llm.env \ + --name crawl4ai \ + unclecode/crawl4ai:0.7.6 +``` + +### Python Package + +```bash +pip install --upgrade crawl4ai +``` + +## 💡 Pro Tips + +1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads +2. **Set custom headers** for webhook authentication and request tracking +3. **Configure global default webhook** for consistent handling across all jobs +4. **Implement idempotent webhook handlers** - same webhook may be delivered multiple times on retry +5. **Use structured schemas** with LLM extraction for predictable webhook data + +## 🎬 Demo + +Try the release demo: + +```bash +python docs/releases_review/demo_v0.7.6.py +``` + +This comprehensive demo showcases: +- Crawl job webhooks (notification-only and with data) +- LLM extraction webhooks (with JSON schema support) +- Custom headers for authentication +- Webhook retry mechanism +- Real-time webhook receiver + +## 🙏 Acknowledgments + +Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing. + +## 📞 Support + +- **Documentation**: https://docs.crawl4ai.com +- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues +- **Discord**: https://discord.gg/crawl4ai + +--- + +**Happy crawling with webhooks!** 🕷️🪝 + +*- unclecode* diff --git a/docs/md_v2/blog/releases/v0.7.5.md b/docs/md_v2/blog/releases/v0.7.5.md new file mode 100644 index 00000000..977d2fd9 --- /dev/null +++ b/docs/md_v2/blog/releases/v0.7.5.md @@ -0,0 +1,318 @@ +# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update + +*September 29, 2025 • 8 min read* + +--- + +Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. 
This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements. + +## 🎯 What's New at a Glance + +- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API +- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion +- **Enhanced LLM Integration**: Custom providers with temperature control +- **HTTPS Preservation**: Secure internal link handling +- **Bug Fixes**: Resolved multiple community-reported issues +- **Improved Docker Error Handling**: Better debugging and reliability + +## 🔧 Docker Hooks System: Pipeline Customization + +Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline. + +### Real Example: Authentication & Performance + +```python +import requests + +# Real working hooks for httpbin.org +hooks_config = { + "on_page_context_created": """ +async def hook(page, context, **kwargs): + print("Hook: Setting up page context") + # Block images to speed up crawling + await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort()) + print("Hook: Images blocked") + return page +""", + + "before_retrieve_html": """ +async def hook(page, context, **kwargs): + print("Hook: Before retrieving HTML") + # Scroll to bottom to load lazy content + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await page.wait_for_timeout(1000) + print("Hook: Scrolled to bottom") + return page +""", + + "before_goto": """ +async def hook(page, context, url, **kwargs): + print(f"Hook: About to navigate to {url}") + # Add custom headers + await page.set_extra_http_headers({ + 'X-Test-Header': 'crawl4ai-hooks-test' + }) + return page +""" +} + +# Test with Docker API +payload = { + "urls": ["https://httpbin.org/html"], + "hooks": { + "code": hooks_config, + "timeout": 30 + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +result = response.json() + +if result.get('success'): + print("✅ Hooks executed successfully!") + print(f"Content length: {len(result.get('markdown', ''))} characters") +``` + +**Available Hook Points:** +- `on_browser_created`: Browser setup +- `on_page_context_created`: Page context configuration +- `before_goto`: Pre-navigation setup +- `after_goto`: Post-navigation processing +- `on_user_agent_updated`: User agent changes +- `on_execution_started`: Crawl initialization +- `before_retrieve_html`: Pre-extraction processing +- `before_return_html`: Final HTML processing + +### Function-Based Hooks API + +Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion! + +**Option 1: Using the `hooks_to_string()` Utility** + +```python +from crawl4ai import hooks_to_string +import requests + +# Define hooks as regular Python functions (with full IDE support!) 
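+# (Assumed contract, inferred from the examples that follow: each hook is an
+# async function that receives the Playwright `page` and `context` plus extra
+# keyword arguments, and returns `page` when it is done.)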
+async def on_page_context_created(page, context, **kwargs):
+    """Block images to speed up crawling"""
+    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
+    await page.set_viewport_size({"width": 1920, "height": 1080})
+    return page
+
+async def before_goto(page, context, url, **kwargs):
+    """Add custom headers"""
+    await page.set_extra_http_headers({
+        'X-Crawl4AI': 'v0.7.5',
+        'X-Custom-Header': 'my-value'
+    })
+    return page
+
+# Convert functions to strings
+hooks_code = hooks_to_string({
+    "on_page_context_created": on_page_context_created,
+    "before_goto": before_goto
+})
+
+# Use with REST API
+payload = {
+    "urls": ["https://httpbin.org/html"],
+    "hooks": {"code": hooks_code, "timeout": 30}
+}
+response = requests.post("http://localhost:11235/crawl", json=payload)
+```
+
+**Option 2: Docker Client with Automatic Conversion (Recommended!)**
+
+```python
+from crawl4ai.docker_client import Crawl4aiDockerClient
+
+# Define hooks as functions (same as above)
+async def on_page_context_created(page, context, **kwargs):
+    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
+    return page
+
+async def before_retrieve_html(page, context, **kwargs):
+    # Scroll to load lazy content
+    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+    await page.wait_for_timeout(1000)
+    return page
+
+# Use Docker client - conversion happens automatically!
+client = Crawl4aiDockerClient(base_url="http://localhost:11235")
+
+results = await client.crawl(
+    urls=["https://httpbin.org/html"],
+    hooks={
+        "on_page_context_created": on_page_context_created,
+        "before_retrieve_html": before_retrieve_html
+    },
+    hooks_timeout=30
+)
+
+if results and results.success:
+    print(f"✅ Hooks executed! HTML length: {len(results.html)}")
+```
+
+**Benefits of Function-Based Hooks:**
+- ✅ Full IDE support (autocomplete, syntax highlighting)
+- ✅ Type checking and linting
+- ✅ Easier to test and debug
+- ✅ Reusable across projects
+- ✅ Automatic conversion in Docker client
+- ✅ No breaking changes - string hooks still work!
+
+## 🤖 Enhanced LLM Integration
+
+v0.7.5 enhances LLM integration with custom providers, temperature control, and base URL configuration.
+
+### Multi-Provider Support
+
+```python
+import requests
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+# Test with different providers
+async def test_llm_providers():
+    # Custom provider (here Gemini) with temperature control
+    llm_strategy = LLMExtractionStrategy(
+        provider="gemini/gemini-2.5-flash-lite",
+        api_token="your-api-token",
+        temperature=0.7,  # New in v0.7.5
+        instruction="Summarize this page in one sentence"
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            "https://example.com",
+            config=CrawlerRunConfig(extraction_strategy=llm_strategy)
+        )
+
+        if result.success:
+            print("✅ LLM extraction completed")
+            print(result.extracted_content)
+
+# Docker API with enhanced LLM config
+llm_payload = {
+    "url": "https://example.com",
+    "f": "llm",
+    "q": "Summarize this page in one sentence.",
+    "provider": "gemini/gemini-2.5-flash-lite",
+    "temperature": 0.7
+}
+
+response = requests.post("http://localhost:11235/md", json=llm_payload)
+```
+
+**New Features:**
+- Custom `temperature` parameter for creativity control
+- `base_url` for custom API endpoints
+- Multi-provider environment variable support
+- Docker API integration
+
+## 🔒 HTTPS Preservation
+
+**The Problem:** Modern web apps require HTTPS everywhere. 
When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
+
+**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
+
+async def test_https_preservation():
+    # Enable HTTPS preservation
+    url_filter = URLPatternFilter(
+        patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
+    )
+
+    config = CrawlerRunConfig(
+        exclude_external_links=True,
+        preserve_https_for_internal_links=True,  # New in v0.7.5
+        stream=True,  # stream results so they can be iterated with async for
+        deep_crawl_strategy=BFSDeepCrawlStrategy(
+            max_depth=2,
+            max_pages=5,
+            filter_chain=FilterChain([url_filter])
+        )
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun(
+            url="https://quotes.toscrape.com",
+            config=config
+        ):
+            # All internal links maintain HTTPS
+            internal_links = [link['href'] for link in result.links['internal']]
+            https_links = [link for link in internal_links if link.startswith('https://')]
+
+            print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
+            for link in https_links[:3]:
+                print(f"  → {link}")
+```
+
+## 🛠️ Bug Fixes and Improvements
+
+### Major Fixes
+- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
+- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
+- **Docker Error Handling**: Comprehensive error messages with status codes
+- **Memory Management**: Fixed leaks in long-running sessions
+- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
+- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
+- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
+- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
+- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
+- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
+
+### Community-Reported Issues Fixed
+This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
+- Fixed browser configuration reference errors
+- Resolved dependency conflicts with cssselect
+- Improved error messaging for failed authentications
+- Enhanced compatibility with various proxy configurations
+- Fixed edge cases in URL normalization
+
+### Configuration Updates
+```python
+# Old proxy config (deprecated)
+# browser_config = BrowserConfig(proxy="http://proxy:8080")
+
+# New enhanced proxy config
+browser_config = BrowserConfig(
+    proxy_config={
+        "server": "http://proxy:8080",
+        "username": "optional-user",
+        "password": "optional-pass"
+    }
+)
+```
+
+## 🔄 Breaking Changes
+
+1. **Python 3.10+ Required**: Upgrade from Python 3.9
+2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
+3. 
**New Dependency**: Added `cssselect` for better CSS handling
+
+## 🚀 Get Started
+
+```bash
+# Install latest version
+pip install crawl4ai==0.7.5
+
+# Docker deployment
+docker pull unclecode/crawl4ai:latest
+docker run -p 11235:11235 unclecode/crawl4ai:latest
+```
+
+**Try the Demo:**
+```bash
+# Run working examples
+python docs/releases_review/demo_v0.7.5.py
+```
+
+**Resources:**
+- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
+- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
+- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
+- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
+
+Happy crawling! 🕷️
diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md
new file mode 100644
index 00000000..d4a5ba65
--- /dev/null
+++ b/docs/md_v2/complete-sdk-reference.md
@@ -0,0 +1,5196 @@
+# Crawl4AI Complete SDK Documentation
+
+**Generated:** 2025-10-19 12:56
+**Format:** Ultra-Dense Reference (Optimized for AI Assistants)
+**Crawl4AI Version:** 0.7.4
+
+---
+
+## Navigation
+
+
+- [Installation & Setup](#installation--setup)
+- [Quick Start](#quick-start)
+- [Core API](#core-api)
+- [Configuration](#configuration)
+- [Crawling Patterns](#crawling-patterns)
+- [Content Processing](#content-processing)
+- [Extraction Strategies](#extraction-strategies)
+- [Advanced Features](#advanced-features)
+
+---
+
+
+# Installation & Setup
+
+## 1. Basic Installation
+```bash
+pip install crawl4ai
+```
+## 2. Initial Setup & Diagnostics
+### 2.1 Run the Setup Command
+```bash
+crawl4ai-setup
+```
+- Performs OS-level checks (e.g., missing libs on Linux)
+- Confirms your environment is ready to crawl
+### 2.2 Diagnostics
+```bash
+crawl4ai-doctor
+```
+- Check Python version compatibility
+- Verify Playwright installation
+- Inspect environment variables or library conflicts
+If any issues arise, follow its suggestions (e.g., installing additional system packages) and re-run `crawl4ai-setup`.
+## 3. Verifying Installation: A Simple Crawl (Skip this step if you have already run `crawl4ai-doctor`)
+Below is a minimal Python script demonstrating a **basic** crawl. It uses our new **`BrowserConfig`** and **`CrawlerRunConfig`** for clarity, though no custom settings are passed in this example:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://www.example.com",
+        )
+        print(result.markdown[:300])  # Show the first 300 characters of extracted text
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- A headless browser session loads `example.com`
+- Crawl4AI returns ~300 characters of markdown.
+If errors occur, rerun `crawl4ai-doctor` or manually ensure Playwright is installed correctly.
+## 4. Advanced Installation (Optional)
+### 4.1 Torch, Transformers, or All
+- **Text Clustering (Torch)**
+  ```bash
+  pip install crawl4ai[torch]
+  crawl4ai-setup
+  ```
+- **Transformers**
+  ```bash
+  pip install crawl4ai[transformer]
+  crawl4ai-setup
+  ```
+- **All Features**
+  ```bash
+  pip install crawl4ai[all]
+  crawl4ai-setup
+  ```
+To pre-fetch the model weights these extras rely on:
+```bash
+crawl4ai-download-models
+```
+## 5. Docker (Experimental)
+```bash
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+You can then make POST requests to `http://localhost:11235/crawl` to perform crawls. 
**Production usage**: prefer the current Docker image (`unclecode/crawl4ai:latest`, port 11235) described in the Docker deployment docs; the `basic` image above is a legacy experimental build.
+## 6. Local Server Mode (Legacy)
+## Summary
+1. **Install** with `pip install crawl4ai` and run `crawl4ai-setup`.
+2. **Diagnose** with `crawl4ai-doctor` if you see errors.
+3. **Verify** by crawling `example.com` with minimal `BrowserConfig` + `CrawlerRunConfig`.
+
+
+
+# Quick Start
+
+# Getting Started with Crawl4AI
+1. Run your **first crawl** using minimal configuration.
+2. Experiment with a simple **CSS-based extraction** strategy.
+3. Crawl a **dynamic** page that loads content via JavaScript.
+## 1. Introduction
+- An asynchronous crawler, **`AsyncWebCrawler`**.
+- Configurable browser and run settings via **`BrowserConfig`** and **`CrawlerRunConfig`**.
+- Automatic HTML-to-Markdown conversion via **`DefaultMarkdownGenerator`** (supports optional filters).
+- Multiple extraction strategies (LLM-based or "traditional" CSS/XPath-based).
+## 2. Your First Crawl
+Here's a minimal Python script that creates an **`AsyncWebCrawler`**, fetches a webpage, and prints the first 300 characters of its Markdown output:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com")
+        print(result.markdown[:300])  # Print first 300 chars
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- **`AsyncWebCrawler`** launches a headless browser (Chromium by default).
+- It fetches `https://example.com`.
+- Crawl4AI automatically converts the HTML into Markdown.
+## 3. Basic Configuration (Light Introduction)
+1. **`BrowserConfig`**: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.).
+2. **`CrawlerRunConfig`**: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.).
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    browser_conf = BrowserConfig(headless=True)  # or False to see the browser
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_conf
+        )
+        print(result.markdown)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+> IMPORTANT: By default cache mode is set to `CacheMode.BYPASS` to get fresh content. Set `CacheMode.ENABLED` to enable caching.
+## 4. Generating Markdown Output
+- **`result.markdown`**: The Markdown generated directly from the cleaned HTML.
+- **`result.markdown.fit_markdown`**:
+  The same content after applying any configured **content filter** (e.g., `PruningContentFilter`). 
+### Example: Using a Filter with `DefaultMarkdownGenerator`
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+md_generator = DefaultMarkdownGenerator(
+    content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
+)
+
+config = CrawlerRunConfig(
+    cache_mode=CacheMode.BYPASS,
+    markdown_generator=md_generator
+)
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun("https://news.ycombinator.com", config=config)
+    print("Raw Markdown length:", len(result.markdown.raw_markdown))
+    print("Fit Markdown length:", len(result.markdown.fit_markdown))
+```
+**Note**: If you do **not** specify a content filter or markdown generator, you'll typically see only the raw Markdown. `PruningContentFilter` may add around `50ms` of processing time. We'll dive deeper into these strategies in a dedicated **Markdown Generation** tutorial.
+## 5. Simple Data Extraction (CSS-based)
+```python
+from crawl4ai import JsonCssExtractionStrategy
+from crawl4ai import LLMConfig
+
+# Generate a schema (one-time cost)
+html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
+
+# Using OpenAI (requires API token)
+schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    llm_config=LLMConfig(provider="openai/gpt-4o", api_token="your-openai-token")  # Required for OpenAI
+)
+
+# Or using Ollama (open source, no token needed)
+schema = JsonCssExtractionStrategy.generate_schema(
+    html,
+    llm_config=LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
+)
+
+# Use the schema for fast, repeated extractions
+strategy = JsonCssExtractionStrategy(schema)
+```
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai import JsonCssExtractionStrategy
+
+async def main():
+    schema = {
+        "name": "Example Items",
+        "baseSelector": "div.item",
+        "fields": [
+            {"name": "title", "selector": "h2", "type": "text"},
+            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
+        ]
+    }
+
+    raw_html = "<div class='item'><h2>Item 1</h2><a href='https://example.com/item1'>Link 1</a></div>"
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="raw://" + raw_html,
+            config=CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                extraction_strategy=JsonCssExtractionStrategy(schema)
+            )
+        )
+        # The JSON output is stored in 'extracted_content'
+        data = json.loads(result.extracted_content)
+        print(data)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- Great for repetitive page structures (e.g., item listings, articles).
+- No AI usage or costs.
+- The crawler returns a JSON string you can parse or store.
+> Tip: You can pass raw HTML to the crawler instead of a URL. To do so, prefix the HTML with `raw://`.
+## 6. Simple Data Extraction (LLM-based)
+- **Open-Source Models** (e.g., `ollama/llama3.3`, `no_token`)
+- **OpenAI Models** (e.g., `openai/gpt-4`, requires `api_token`)
+- Or any provider supported by the underlying library
+```python
+import os
+import json
+import asyncio
+from typing import Dict
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import LLMExtractionStrategy
+
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(
+        ..., description="Fee for output token for the OpenAI model."
+    )
+
+async def extract_structured_data_using_llm(
+    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+    print(f"\n--- Extracting Structured Data with {provider} ---")
+
+    if api_token is None and provider != "ollama":
+        print(f"API token is required for {provider}. Skipping this example.")
+        return
+
+    browser_config = BrowserConfig(headless=True)
+
+    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+    if extra_headers:
+        extra_args["extra_headers"] = extra_headers
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=1,
+        page_timeout=80000,
+        extraction_strategy=LLMExtractionStrategy(
+            llm_config=LLMConfig(provider=provider, api_token=api_token),
+            schema=OpenAIModelFee.model_json_schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+            Do not miss any models in the entire content.""",
+            extra_args=extra_args,
+        ),
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://openai.com/api/pricing/", config=crawler_config
+        )
+        print(result.extracted_content)
+
+if __name__ == "__main__":
+    asyncio.run(
+        extract_structured_data_using_llm(
+            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
+        )
+    )
+```
+- We define a Pydantic schema (`OpenAIModelFee`) describing the fields we want.
+## 7. Adaptive Crawling (New!)
+Adaptive crawling lets the crawler decide for itself when it has gathered enough information to answer a query, instead of crawling a fixed set of pages:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
+
+async def adaptive_example():
+    async with AsyncWebCrawler() as crawler:
+        adaptive = AdaptiveCrawler(crawler)
+
+        # Start adaptive crawling
+        result = await adaptive.digest(
+            start_url="https://docs.python.org/3/",
+            query="async context managers"
+        )
+
+        # View results
+        adaptive.print_stats()
+        print(f"Crawled {len(result.crawled_urls)} pages")
+        print(f"Achieved {adaptive.confidence:.0%} confidence")
+
+if __name__ == "__main__":
+    asyncio.run(adaptive_example())
+```
+- **Automatic stopping**: Stops when sufficient information is gathered
+- **Intelligent link selection**: Follows only relevant links
+- **Confidence scoring**: Know how complete your information is
+## 8. Multi-URL Concurrency (Preview)
+If you need to crawl multiple URLs in **parallel**, you can use `arun_many()`. By default, Crawl4AI employs a **MemoryAdaptiveDispatcher**, automatically adjusting concurrency based on system resources. Here's a quick glimpse:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+async def quick_parallel_example():
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3"
+    ]
+
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=True  # Enable streaming mode
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        # Stream results as they complete
+        async for result in await crawler.arun_many(urls, config=run_conf):
+            if result.success:
+                print(f"[OK] {result.url}, length: {len(result.markdown.raw_markdown)}")
+            else:
+                print(f"[ERROR] {result.url} => {result.error_message}")
+
+        # Or get all results at once (default behavior)
+        run_conf = run_conf.clone(stream=False)
+        results = await crawler.arun_many(urls, config=run_conf)
+        for res in results:
+            if res.success:
+                print(f"[OK] {res.url}, length: {len(res.markdown.raw_markdown)}")
+            else:
+                print(f"[ERROR] {res.url} => {res.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(quick_parallel_example())
+```
+1. **Streaming mode** (`stream=True`): Process results as they become available using `async for`
+2. **Batch mode** (`stream=False`): Wait for all results to complete
+## 9. Dynamic Content Example
+Some sites require multiple "page clicks" or dynamic JavaScript updates. 
Below is an example that **clicks** through a page's tabbed sections and waits for each tab's content to load, using **`BrowserConfig`** and **`CrawlerRunConfig`**:
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import JsonCssExtractionStrategy
+
+async def extract_structured_data_using_css_extractor():
+    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+    schema = {
+        "name": "KidoCode Courses",
+        "baseSelector": "section.charge-methodology .w-tab-content > div",
+        "fields": [
+            {
+                "name": "section_title",
+                "selector": "h3.heading-50",
+                "type": "text",
+            },
+            {
+                "name": "section_description",
+                "selector": ".charge-content",
+                "type": "text",
+            },
+            {
+                "name": "course_name",
+                "selector": ".text-block-93",
+                "type": "text",
+            },
+            {
+                "name": "course_description",
+                "selector": ".course-content-text",
+                "type": "text",
+            },
+            {
+                "name": "course_icon",
+                "selector": ".image-92",
+                "type": "attribute",
+                "attribute": "src",
+            },
+        ],
+    }
+
+    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+    js_click_tabs = """
+    (async () => {
+        const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+        for(let tab of tabs) {
+            tab.scrollIntoView();
+            tab.click();
+            await new Promise(r => setTimeout(r, 500));
+        }
+    })();
+    """
+
+    crawler_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        js_code=[js_click_tabs],
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(
+            url="https://www.kidocode.com/degrees/technology", config=crawler_config
+        )
+
+        courses = json.loads(result.extracted_content)
+        print(f"Successfully extracted {len(courses)} course entries")
+        print(json.dumps(courses[0], indent=2))
+
+async def main():
+    await extract_structured_data_using_css_extractor()
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- **`BrowserConfig(headless=True, java_script_enabled=True)`**: JavaScript stays enabled so the tab-clicking script can run.
+- **`CrawlerRunConfig(...)`**: We specify the CSS extraction strategy and bypass the cache for fresh content.
+- **`js_code`** clicks through every tab and waits 500 ms after each click so the tab's content can render before extraction.
+- After all tabs have been activated, a single extraction pass collects the content from every tab.
+## 10. Next Steps
+In this quick tour, you have:
+1. Performed a basic crawl and printed Markdown.
+2. Used **content filters** with a markdown generator.
+3. Extracted JSON via **CSS** or **LLM** strategies.
+4. Handled **dynamic** pages with JavaScript triggers.
+
+
+
+# Core API
+
+# AsyncWebCrawler
+The **`AsyncWebCrawler`** is the core class for asynchronous web crawling in Crawl4AI. You typically create it **once**, optionally customize it with a **`BrowserConfig`** (e.g., headless, user agent), then **run** multiple **`arun()`** calls with different **`CrawlerRunConfig`** objects.
+1. **Create** a `BrowserConfig` for global browser settings. 
+2. **Instantiate** `AsyncWebCrawler(config=browser_config)`. 
+3. **Use** the crawler in an async context manager (`async with`) or manage start/close manually. 
+4. **Call** `arun(url, config=crawler_run_config)` for each page you want.
+## 1. 
Constructor Overview
+```python
+class AsyncWebCrawler:
+    def __init__(
+        self,
+        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
+        config: Optional[BrowserConfig] = None,
+        always_bypass_cache: bool = False,           # deprecated
+        always_by_pass_cache: Optional[bool] = None, # also deprecated
+        base_directory: str = ...,
+        thread_safe: bool = False,
+        **kwargs,
+    ):
+        """
+        Create an AsyncWebCrawler instance.
+
+        Args:
+            crawler_strategy:
+                (Advanced) Provide a custom crawler strategy if needed.
+            config:
+                A BrowserConfig object specifying how the browser is set up.
+            always_bypass_cache:
+                (Deprecated) Use CrawlerRunConfig.cache_mode instead.
+            base_directory:
+                Folder for storing caches/logs (if relevant).
+            thread_safe:
+                If True, attempts some concurrency safeguards. Usually False.
+            **kwargs:
+                Additional legacy or debugging parameters.
+        """
+```
+
+### Typical Initialization
+
+```python
+from crawl4ai import AsyncWebCrawler, BrowserConfig
+browser_cfg = BrowserConfig(
+    browser_type="chromium",
+    headless=True,
+    verbose=True
+)
+crawler = AsyncWebCrawler(config=browser_cfg)
+```
+
+**Notes**:
+
+- **Legacy** parameters like `always_bypass_cache` remain for backward compatibility, but prefer to set **caching** in `CrawlerRunConfig`.
+
+---
+
+## 2. Lifecycle: Start/Close or Context Manager
+
+### 2.1 Context Manager (Recommended)
+
+```python
+async with AsyncWebCrawler(config=browser_cfg) as crawler:
+    result = await crawler.arun("https://example.com")
+    # The crawler automatically starts/closes resources
+```
+
+When the `async with` block ends, the crawler cleans up (closes the browser, etc.).
+
+### 2.2 Manual Start & Close
+
+```python
+crawler = AsyncWebCrawler(config=browser_cfg)
+await crawler.start()
+result1 = await crawler.arun("https://example.com")
+result2 = await crawler.arun("https://another.com")
+await crawler.close()
+```
+
+Use this style if you have a **long-running** application or need full control of the crawler's lifecycle.
+
+---
+
+## 3. Primary Method: `arun()`
+
+```python
+async def arun(
+    url: str,
+    config: Optional[CrawlerRunConfig] = None,
+    # Legacy parameters for backward compatibility...
+):
+```
+
+### 3.1 New Approach
+
+You pass a `CrawlerRunConfig` object that sets up everything about a crawl—content filtering, caching, session reuse, JS code, screenshots, etc.
+
+```python
+import asyncio
+from crawl4ai import CrawlerRunConfig, CacheMode
+run_cfg = CrawlerRunConfig(
+    cache_mode=CacheMode.BYPASS,
+    css_selector="main.article",
+    word_count_threshold=10,
+    screenshot=True
+)
+async with AsyncWebCrawler(config=browser_cfg) as crawler:
+    result = await crawler.arun("https://example.com/news", config=run_cfg)
+```
+
+### 3.2 Legacy Parameters Still Accepted
+
+For **backward** compatibility, `arun()` can still accept direct arguments like `css_selector=...`, `word_count_threshold=...`, etc., but we strongly advise migrating them into a **`CrawlerRunConfig`**.
+
+---
+
+## 4. Batch Processing: `arun_many()`
+
+```python
+async def arun_many(
+    urls: List[str],
+    config: Optional[CrawlerRunConfig] = None,
+    # Legacy parameters maintained for backwards compatibility...
+):
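+    # By default, arun_many() returns a list of CrawlResult objects; with
+    # CrawlerRunConfig(stream=True) the results can instead be consumed with
+    # `async for` as they complete (see the multi-URL example in Quick Start).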
+```
+
+### 4.1 Resource-Aware Crawling
+
+The `arun_many()` method now uses an intelligent dispatcher that:
+
+- Monitors system memory usage
+- Implements adaptive rate limiting
+- Provides detailed progress monitoring
+- Manages concurrent crawls efficiently
+
+### 4.2 Example Usage
+
+Check page [Multi-url Crawling](../advanced/multi-url-crawling.md) for a detailed example of how to use `arun_many()`.
+
+### 4.3 Key Features
+1. **Rate Limiting**
+   - Automatic delay between requests
+   - Exponential backoff on rate limit detection
+   - Domain-specific rate limiting
+   - Configurable retry strategy
+2. **Resource Monitoring**
+   - Memory usage tracking
+   - Adaptive concurrency based on system load
+   - Automatic pausing when resources are constrained
+3. **Progress Monitoring**
+   - Detailed or aggregated progress display
+   - Real-time status updates
+   - Memory usage statistics
+4. **Error Handling**
+   - Graceful handling of rate limits
+   - Automatic retries with backoff
+   - Detailed error reporting
+## 5. `CrawlResult` Output
+Each `arun()` returns a **`CrawlResult`** containing:
+- `url`: Final URL (if redirected).
+- `html`: Original HTML.
+- `cleaned_html`: Sanitized HTML.
+- `markdown_v2`: Deprecated; use `markdown` instead.
+- `extracted_content`: If an extraction strategy was used (JSON for CSS/LLM strategies).
+- `screenshot`, `pdf`: If screenshots/PDF requested.
+- `media`, `links`: Information about discovered images/links.
+- `success`, `error_message`: Status info.
+## 6. Quick Example
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai import JsonCssExtractionStrategy
+import json
+
+async def main():
+    # 1. Browser config
+    browser_cfg = BrowserConfig(
+        browser_type="firefox",
+        headless=False,
+        verbose=True
+    )
+
+    # 2. Run config
+    schema = {
+        "name": "Articles",
+        "baseSelector": "article.post",
+        "fields": [
+            {
+                "name": "title",
+                "selector": "h2",
+                "type": "text"
+            },
+            {
+                "name": "url",
+                "selector": "a",
+                "type": "attribute",
+                "attribute": "href"
+            }
+        ]
+    }
+
+    run_cfg = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(schema),
+        word_count_threshold=15,
+        remove_overlay_elements=True,
+        wait_for="css:.post"  # Wait for posts to appear
+    )
+
+    async with AsyncWebCrawler(config=browser_cfg) as crawler:
+        result = await crawler.arun(
+            url="https://example.com/blog",
+            config=run_cfg
+        )
+
+        if result.success:
+            print("Cleaned HTML length:", len(result.cleaned_html))
+            if result.extracted_content:
+                articles = json.loads(result.extracted_content)
+                print("Extracted articles:", articles[:2])
+        else:
+            print("Error:", result.error_message)
+
+asyncio.run(main())
+```
+- We define a **`BrowserConfig`** with Firefox, no headless, and `verbose=True`. 
+- We define a **`CrawlerRunConfig`** that **bypasses cache**, uses a **CSS** extraction schema, has a `word_count_threshold=15`, etc. 
+- We pass them to `AsyncWebCrawler(config=...)` and `arun(url=..., config=...)`.
+## 7. Best Practices & Migration Notes
+1. **Use** `BrowserConfig` for **global** settings about the browser's environment. 
+2. **Use** `CrawlerRunConfig` for **per-crawl** logic (caching, content filtering, extraction strategies, wait conditions). 
+3. **Avoid** legacy parameters like `css_selector` or `word_count_threshold` directly in `arun()`. 
## 8. Summary

- **Constructor** accepts **`BrowserConfig`** (or defaults).
- **`arun(url, config=CrawlerRunConfig)`** is the main method for single-page crawls.
- **`arun_many(urls, config=CrawlerRunConfig)`** handles concurrency across multiple URLs.
- For advanced lifecycle control, use `start()` and `close()` explicitly.
- If you used `AsyncWebCrawler(browser_type="chromium", css_selector="...")`, move browser settings to `BrowserConfig(...)` and content/crawl logic to `CrawlerRunConfig(...)`.


# `arun()` Parameter Guide (New Approach)

In Crawl4AI's **latest** configuration model, nearly all parameters that once went directly to `arun()` are now part of **`CrawlerRunConfig`**. When calling `arun()`, you provide:

```python
await crawler.arun(
    url="https://example.com",
    config=my_run_config
)
```

Below is an organized look at the parameters that can go inside `CrawlerRunConfig`, divided by their functional areas. For **browser** settings (e.g., `headless`, `browser_type`), see [BrowserConfig](./parameters.md).

## 1. Core Usage

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    run_config = CrawlerRunConfig(
        verbose=True,                  # Detailed logging
        cache_mode=CacheMode.ENABLED,  # Use normal read/write cache
        check_robots_txt=True,         # Respect robots.txt rules
        # ... other parameters
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=run_config
        )

        # Check if blocked by robots.txt
        if not result.success and result.status_code == 403:
            print(f"Error: {result.error_message}")
```

- `verbose=True` logs each crawl step.
- `cache_mode` decides how to read/write the local crawl cache.

## 2. Cache Control

**`cache_mode`** (default: `CacheMode.ENABLED`)

Use a built-in enum from `CacheMode`:

- `ENABLED`: Normal caching—reads if available, writes if missing.
- `DISABLED`: No caching—always refetch pages.
- `READ_ONLY`: Reads from cache only; no new writes.
- `WRITE_ONLY`: Writes to cache but doesn't read existing data.
- `BYPASS`: Skips reading cache for this crawl (though it might still write if set up that way).

```python
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS
)
```

Legacy boolean flags map onto these modes:

- `bypass_cache=True` acts like `CacheMode.BYPASS`.
- `disable_cache=True` acts like `CacheMode.DISABLED`.
- `no_cache_read=True` acts like `CacheMode.WRITE_ONLY`.
- `no_cache_write=True` acts like `CacheMode.READ_ONLY`.

## 3. Content Processing & Selection

### 3.1 Text Processing

```python
run_config = CrawlerRunConfig(
    word_count_threshold=10,    # Ignore text blocks with fewer than 10 words
    only_text=False,            # If True, tries to remove non-text elements
    keep_data_attributes=False  # Keep or discard data-* attributes
)
```

### 3.2 Content Selection

```python
run_config = CrawlerRunConfig(
    css_selector=".main-content",   # Focus on the .main-content region only
    excluded_tags=["form", "nav"],  # Remove entire tag blocks
    remove_forms=True,              # Specifically strip <form> elements
    remove_overlay_elements=True,   # Attempt to remove modals/popups
)
```
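If you want focused markdown and extraction without cropping link/media collection, `target_elements` (covered in the parameter reference) is a more flexible alternative. A small sketch, with hypothetical selectors:

```python
run_config = CrawlerRunConfig(
    # Markdown generation and extraction focus on these regions,
    # while links and media are still collected from the whole page
    target_elements=["article.main", ".product-details"],
)
```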
### 3.3 Link Handling

```python
run_config = CrawlerRunConfig(
    exclude_external_links=True,          # Remove external links from final content
    exclude_social_media_links=True,      # Remove links to known social sites
    exclude_domains=["ads.example.com"],  # Exclude links to these domains
    exclude_social_media_domains=["facebook.com", "twitter.com"],  # Extend the default list
)
```

### 3.4 Media Filtering

```python
run_config = CrawlerRunConfig(
    exclude_external_images=True  # Strip images from other domains
)
```

## 4. Page Navigation & Timing

### 4.1 Basic Browser Flow

```python
run_config = CrawlerRunConfig(
    wait_for="css:.dynamic-content",  # Wait for .dynamic-content
    delay_before_return_html=2.0,     # Wait 2s before capturing final HTML
    page_timeout=60000,               # Navigation & script timeout (ms)
)
```

- `wait_for` accepts:
  - `"css:selector"`, or
  - `"js:() => boolean"`, e.g. `js:() => document.querySelectorAll('.item').length > 10`.
- `mean_delay` & `max_range` define random delays for `arun_many()` calls.
- `semaphore_count` sets the concurrency limit when crawling multiple URLs.

### 4.2 JavaScript Execution

```python
run_config = CrawlerRunConfig(
    js_code=[
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more')?.click();"
    ],
    js_only=False
)
```

- `js_code` can be a single string or a list of strings.
- `js_only=True` means "I'm continuing in the same session with new JS steps, no new full navigation."

### 4.3 Anti-Bot

```python
run_config = CrawlerRunConfig(
    magic=True,
    simulate_user=True,
    override_navigator=True
)
```

- `magic=True` tries multiple stealth features.
- `simulate_user=True` mimics mouse movements and random delays.
- `override_navigator=True` fakes some navigator properties (like user agent checks).

## 5. Session Management

**`session_id`**:

```python
run_config = CrawlerRunConfig(
    session_id="my_session123"
)
```

If re-used in subsequent `arun()` calls, the same tab/page context is continued (helpful for multi-step tasks or stateful browsing).
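A rough two-step sketch (URL and selector are placeholders; `js_only=True` keeps the second step in the same tab without a full reload):

```python
async with AsyncWebCrawler() as crawler:
    base_cfg = CrawlerRunConfig(session_id="my_session123", cache_mode=CacheMode.BYPASS)

    # Step 1: initial navigation
    first = await crawler.arun("https://example.com/listing", config=base_cfg)

    # Step 2: reuse the same tab, only run JS (no new navigation)
    next_cfg = base_cfg.clone(
        js_code="document.querySelector('.load-more')?.click();",
        js_only=True,
    )
    second = await crawler.arun("https://example.com/listing", config=next_cfg)
```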
## 6. Screenshot, PDF & Media Options

```python
run_config = CrawlerRunConfig(
    screenshot=True,          # Grab a screenshot as base64
    screenshot_wait_for=1.0,  # Wait 1s before capturing
    pdf=True,                 # Also produce a PDF
    image_description_min_word_threshold=5,  # If analyzing alt text
    image_score_threshold=3,                 # Filter out low-score images
)
```

- `result.screenshot` → Base64 screenshot string.
- `result.pdf` → Byte array with PDF data.

## 7. Extraction Strategy

**For advanced data extraction** (CSS/LLM-based), set `extraction_strategy`:

```python
run_config = CrawlerRunConfig(
    extraction_strategy=my_css_or_llm_strategy
)
```

The extracted data will appear in `result.extracted_content`.

## 8. Comprehensive Example

Below is a snippet combining many parameters:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai import JsonCssExtractionStrategy

async def main():
    # Example schema
    schema = {
        "name": "Articles",
        "baseSelector": "article.post",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text"},
            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
        ]
    }

    run_config = CrawlerRunConfig(
        # Core
        verbose=True,
        cache_mode=CacheMode.ENABLED,
        check_robots_txt=True,  # Respect robots.txt rules

        # Content
        word_count_threshold=10,
        css_selector="main.content",
        excluded_tags=["nav", "footer"],
        exclude_external_links=True,

        # Page & JS
        js_code="document.querySelector('.show-more')?.click();",
        wait_for="css:.loaded-block",
        page_timeout=30000,

        # Extraction
        extraction_strategy=JsonCssExtractionStrategy(schema),

        # Session
        session_id="persistent_session",

        # Media
        screenshot=True,
        pdf=True,

        # Anti-bot
        simulate_user=True,
        magic=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/posts", config=run_config)
        if result.success:
            print("HTML length:", len(result.cleaned_html))
            print("Extraction JSON:", result.extracted_content)
            if result.screenshot:
                print("Screenshot length:", len(result.screenshot))
            if result.pdf:
                print("PDF bytes length:", len(result.pdf))
        else:
            print("Error:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())
```

This example covers:

1. **Crawling** the main content region, ignoring external links.
2. Running **JavaScript** to click ".show-more".
3. **Waiting** for ".loaded-block" to appear.
4. Generating a **screenshot** & **PDF** of the final page.

## 9. Best Practices

1. **Use `BrowserConfig` for global browser settings** (headless, user agent).
2. **Use `CrawlerRunConfig`** to handle the **specific** crawl needs: content filtering, caching, JS, screenshots, extraction, etc.
3. **Limit** large concurrency (`semaphore_count`) if the site or your system can't handle it.
4. For dynamic pages, set `js_code` or `scan_full_page` so all content loads, as in the sketch below.
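A minimal sketch of that last point (both parameters are described in the configuration reference):

```python
run_config = CrawlerRunConfig(
    scan_full_page=True,  # auto-scroll so lazily loaded content appears
    scroll_delay=0.2,     # pause (seconds) between scroll steps
)
```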
## 10. Conclusion

All parameters that used to be direct arguments to `arun()` now belong in **`CrawlerRunConfig`**, which makes your code clearer and more maintainable.


# `arun_many(...)` Reference

> **Note**: This function is very similar to [`arun()`](./arun.md) but focused on **concurrent** or **batch** crawling. If you're unfamiliar with `arun()` usage, please read that doc first, then review this for differences.

## Function Signature

```python
async def arun_many(
    urls: Union[List[str], List[Any]],
    config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
    dispatcher: Optional[BaseDispatcher] = None,
    ...
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
    """
    Crawl multiple URLs concurrently or in batches.

    :param urls: A list of URLs (or tasks) to crawl.
    :param config: (Optional) Either:
        - A single `CrawlerRunConfig` applying to all URLs
        - A list of `CrawlerRunConfig` objects with url_matcher patterns
    :param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
    ...
    :return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
    """
```

## Differences from `arun()`

1. **Multiple URLs**:
   - Instead of crawling a single URL, you pass a list of them (strings or tasks).
   - The function returns either a **list** of `CrawlResult` or an **async generator** if streaming is enabled.
2. **Concurrency & Dispatchers**:
   - The **`dispatcher`** param allows advanced concurrency control.
   - If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
3. **Streaming Support**:
   - Enable streaming by setting `stream=True` in your `CrawlerRunConfig`.
   - When streaming, use `async for` to process results as they become available.
4. **Parallel Execution**:
   - `arun_many()` can run multiple requests concurrently under the hood.
   - Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).

### Basic Example (Batch Mode)

```python
# Minimal usage: the default dispatcher will be used
results = await crawler.arun_many(
    urls=["https://site1.com", "https://site2.com"],
    config=CrawlerRunConfig(stream=False)  # Default behavior
)

for res in results:
    if res.success:
        print(res.url, "crawled OK!")
    else:
        print("Failed:", res.url, "-", res.error_message)
```

### Streaming Example

```python
config = CrawlerRunConfig(
    stream=True,  # Enable streaming mode
    cache_mode=CacheMode.BYPASS
)

# Process results as they complete
async for result in await crawler.arun_many(
    urls=["https://site1.com", "https://site2.com", "https://site3.com"],
    config=config
):
    if result.success:
        print(f"Just completed: {result.url}")
        # Process each result immediately
        process_result(result)
```

### With a Custom Dispatcher

```python
dispatcher = MemoryAdaptiveDispatcher(
    memory_threshold_percent=70.0,
    max_session_permit=10
)
results = await crawler.arun_many(
    urls=["https://site1.com", "https://site2.com", "https://site3.com"],
    config=my_run_config,
    dispatcher=dispatcher
)
```

### URL-Specific Configurations

Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:

```python
from crawl4ai import CrawlerRunConfig, MatchMode
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# PDF files - specialized extraction
pdf_config = CrawlerRunConfig(
    url_matcher="*.pdf",
    scraping_strategy=PDFContentScrapingStrategy()
)

# Blog/article pages - content filtering
blog_config = CrawlerRunConfig(
    url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
    markdown_generator=DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.48)
    )
)

# Dynamic pages - JavaScript execution
github_config = CrawlerRunConfig(
    url_matcher=lambda url: 'github.com' in url,
    js_code="window.scrollTo(0, 500);"
)

# API endpoints - JSON extraction
api_config = CrawlerRunConfig(
    url_matcher=lambda url: 'api' in url or url.endswith('.json'),
    # Custom settings for JSON extraction
)

# Default fallback config
default_config = CrawlerRunConfig()  # No url_matcher: matches any URL, so it acts as the fallback

# Pass the list of configs - first match wins!
results = await crawler.arun_many(
    urls=[
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # → pdf_config
        "https://blog.python.org/",                  # → blog_config
        "https://github.com/microsoft/playwright",   # → github_config
        "https://httpbin.org/json",                  # → api_config
        "https://example.com/"                       # → default_config
    ],
    config=[pdf_config, blog_config, github_config, api_config, default_config]
)
```

- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
- **Function matchers**: `lambda url: 'api' in url`
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
- **First match wins**: Configs are evaluated in order
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.

### Return Value

Either a **list** of [`CrawlResult`](./crawl-result.md) objects, or an **async generator** if streaming is enabled. You can iterate to check `result.success` or read each item's `extracted_content`, `markdown`, or `dispatch_result`.

## Dispatcher Reference

- **`MemoryAdaptiveDispatcher`**: Dynamically manages concurrency based on system memory usage.
- **`SemaphoreDispatcher`**: Fixed concurrency limit; simpler but less adaptive.

## Common Pitfalls

- **Error Handling**: Each `CrawlResult` might fail for different reasons—always check `result.success` or the `error_message` before proceeding.

## Conclusion

Use `arun_many()` when you want to **crawl multiple URLs** simultaneously or in controlled parallel tasks. If you need advanced concurrency features (like memory-based adaptive throttling or complex rate-limiting), provide a **dispatcher**. Each result is a standard `CrawlResult`, possibly augmented with concurrency stats (`dispatch_result`) for deeper inspection. For more details on concurrency logic and dispatchers, see the [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) docs.


# `CrawlResult` Reference

The **`CrawlResult`** class encapsulates everything returned after a single crawl operation. It provides the **raw or processed content**, details on links and media, plus optional metadata (like screenshots, PDFs, or extracted JSON).

**Location**: `crawl4ai/crawler/models.py` (for reference)

```python
class CrawlResult(BaseModel):
    url: str
    html: str
    success: bool
    cleaned_html: Optional[str] = None
    fit_html: Optional[str] = None  # Preprocessed HTML optimized for extraction
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
    mhtml: Optional[str] = None
    markdown: Optional[Union[str, MarkdownGenerationResult]] = None
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
    session_id: Optional[str] = None
    response_headers: Optional[dict] = None
    status_code: Optional[int] = None
    ssl_certificate: Optional[SSLCertificate] = None
    dispatch_result: Optional[DispatchResult] = None
    ...
```

## 1. Basic Crawl Info

### 1.1 **`url`** *(str)*

```python
print(result.url)  # e.g., "https://example.com/"
```

### 1.2 **`success`** *(bool)*

**What**: `True` if the crawl pipeline ended without major errors; `False` otherwise.
```python
if not result.success:
    print(f"Crawl failed: {result.error_message}")
```

### 1.3 **`status_code`** *(Optional[int])*

```python
if result.status_code == 404:
    print("Page not found!")
```

### 1.4 **`error_message`** *(Optional[str])*

**What**: If `success=False`, a textual description of the failure.

```python
if not result.success:
    print("Error:", result.error_message)
```

### 1.5 **`session_id`** *(Optional[str])*

```python
# If you used session_id="login_session" in CrawlerRunConfig, see it here:
print("Session:", result.session_id)
```

### 1.6 **`response_headers`** *(Optional[dict])*

```python
if result.response_headers:
    print("Server:", result.response_headers.get("Server", "Unknown"))
```

### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])*

**What**: If `fetch_ssl_certificate=True` in your `CrawlerRunConfig`, **`result.ssl_certificate`** contains an [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site's certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`, `subject`, `valid_from`, `valid_until`, etc.

```python
if result.ssl_certificate:
    print("Issuer:", result.ssl_certificate.issuer)
```

## 2. Raw / Cleaned Content

### 2.1 **`html`** *(str)*

```python
# Possibly large
print(len(result.html))
```

### 2.2 **`cleaned_html`** *(Optional[str])*

**What**: A sanitized HTML version—scripts, styles, or excluded tags are removed based on your `CrawlerRunConfig`.

```python
print(result.cleaned_html[:500])  # Show a snippet
```

## 3. Markdown Fields

### 3.1 The Markdown Generation Approach

Crawl4AI can produce several flavors of markdown:

- **Raw** markdown
- **Links as citations** (with a references section)
- **Fit** markdown if a **content filter** is used (like Pruning or BM25)

**`MarkdownGenerationResult`** includes:

- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion.
- **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations.
- **`references_markdown`** *(str)*: The reference list or footnotes at the end.
- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered "fit" text.
- **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`.

```python
if result.markdown:
    md_res = result.markdown
    print("Raw MD:", md_res.raw_markdown[:300])
    print("Citations MD:", md_res.markdown_with_citations[:300])
    print("References:", md_res.references_markdown)
    if md_res.fit_markdown:
        print("Pruned text:", md_res.fit_markdown[:300])
```

### 3.2 **`markdown`** *(Optional[Union[str, MarkdownGenerationResult]])*

**What**: Holds the `MarkdownGenerationResult`.

```python
print(result.markdown.raw_markdown[:200])
print(result.markdown.fit_markdown)
print(result.markdown.fit_html)
```

**Important**: "Fit" content (`fit_markdown`/`fit_html`) exists in `result.markdown` only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
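As a minimal sketch of producing fit content, reusing the pruning filter shown elsewhere in these docs (the threshold value is illustrative):

```python
from crawl4ai import CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

run_cfg = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.48)
    )
)
# After arun(), result.markdown.fit_markdown should be populated.
```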
## 4. Media & Links

### 4.1 **`media`** *(Dict[str, List[Dict]])*

**What**: Contains info about discovered images, videos, or audio. Typical keys: `"images"`, `"videos"`, `"audios"`. Each entry may include:

- `src` *(str)*: Media URL
- `alt` or `title` *(str)*: Descriptive text
- `score` *(float)*: Relevance score if the crawler's heuristic found it "important"
- `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text

```python
images = result.media.get("images", [])
for img in images:
    if img.get("score", 0) > 5:
        print("High-value image:", img["src"])
```

### 4.2 **`links`** *(Dict[str, List[Dict]])*

**What**: Holds internal and external link data. Usually two keys: `"internal"` and `"external"`. Each link entry may include:

- `href` *(str)*: The link target
- `text` *(str)*: Link text
- `title` *(str)*: Title attribute
- `context` *(str)*: Surrounding text snippet
- `domain` *(str)*: If external, the domain

```python
for link in result.links["internal"]:
    print(f"Internal link to {link['href']} with text {link['text']}")
```

## 5. Additional Fields

### 5.1 **`extracted_content`** *(Optional[str])*

**What**: If you used **`extraction_strategy`** (CSS, LLM, etc.), the structured output (JSON).

```python
if result.extracted_content:
    data = json.loads(result.extracted_content)
    print(data)
```

### 5.2 **`downloaded_files`** *(Optional[List[str]])*

**What**: If `accept_downloads=True` in your `BrowserConfig` and `downloads_path` is set, lists local file paths for downloaded items.

```python
if result.downloaded_files:
    for file_path in result.downloaded_files:
        print("Downloaded:", file_path)
```

### 5.3 **`screenshot`** *(Optional[str])*

**What**: Base64-encoded screenshot if `screenshot=True` in `CrawlerRunConfig`.

```python
import base64
if result.screenshot:
    with open("page.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
```

### 5.4 **`pdf`** *(Optional[bytes])*

**What**: Raw PDF bytes if `pdf=True` in `CrawlerRunConfig`.

```python
if result.pdf:
    with open("page.pdf", "wb") as f:
        f.write(result.pdf)
```

### 5.5 **`mhtml`** *(Optional[str])*

**What**: MHTML snapshot of the page if `capture_mhtml=True` in `CrawlerRunConfig`. MHTML (MIME HTML) preserves the entire web page with all its resources (CSS, images, scripts, etc.) in a single file.

```python
if result.mhtml:
    with open("page.mhtml", "w", encoding="utf-8") as f:
        f.write(result.mhtml)
```

### 5.6 **`metadata`** *(Optional[dict])*

```python
if result.metadata:
    print("Title:", result.metadata.get("title"))
    print("Author:", result.metadata.get("author"))
```

## 6. `dispatch_result` (optional)

A `DispatchResult` object providing additional concurrency and resource usage information when crawling URLs in parallel (e.g., via `arun_many()` with custom dispatchers). It contains:

- **`task_id`**: A unique identifier for the parallel task.
- **`memory_usage`** *(float)*: The memory (in MB) used at the time of completion.
- **`peak_memory`** *(float)*: The peak memory usage (in MB) recorded during the task's execution.
- **`start_time`** / **`end_time`** *(datetime)*: Time range for this crawling task.
- **`error_message`** *(str)*: Any dispatcher- or concurrency-related error encountered.
```python
# Example usage:
for result in results:
    if result.success and result.dispatch_result:
        dr = result.dispatch_result
        print(f"URL: {result.url}, Task ID: {dr.task_id}")
        print(f"Memory: {dr.memory_usage:.1f} MB (Peak: {dr.peak_memory:.1f} MB)")
        print(f"Duration: {dr.end_time - dr.start_time}")
```

> **Note**: This field is typically populated when using `arun_many(...)` alongside a **dispatcher** (e.g., `MemoryAdaptiveDispatcher` or `SemaphoreDispatcher`). If no concurrency or dispatcher is used, `dispatch_result` may remain `None`.

## 7. Network Requests & Console Messages

When you enable network and console message capturing in `CrawlerRunConfig` using `capture_network_requests=True` and `capture_console_messages=True`, the `CrawlResult` will include these fields:

### 7.1 **`network_requests`** *(Optional[List[Dict[str, Any]]])*

- Each item has an `event_type` field that can be `"request"`, `"response"`, or `"request_failed"`.
- Request events include `url`, `method`, `headers`, `post_data`, `resource_type`, and `is_navigation_request`.
- Response events include `url`, `status`, `status_text`, `headers`, and `request_timing`.
- Failed request events include `url`, `method`, `resource_type`, and `failure_text`.
- All events include a `timestamp` field.

```python
if result.network_requests:
    # Count different types of events
    requests = [r for r in result.network_requests if r.get("event_type") == "request"]
    responses = [r for r in result.network_requests if r.get("event_type") == "response"]
    failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]

    print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")

    # Analyze API calls
    api_calls = [r for r in requests if "api" in r.get("url", "")]

    # Identify failed resources
    for failure in failures:
        print(f"Failed to load: {failure.get('url')} - {failure.get('failure_text')}")
```

### 7.2 **`console_messages`** *(Optional[List[Dict[str, Any]]])*

- Each item has a `type` field indicating the message type (e.g., `"log"`, `"error"`, `"warning"`, etc.).
- The `text` field contains the actual message text.
- Some messages include `location` information (URL, line, column).
- All messages include a `timestamp` field.

```python
if result.console_messages:
    # Count messages by type
    message_types = {}
    for msg in result.console_messages:
        msg_type = msg.get("type", "unknown")
        message_types[msg_type] = message_types.get(msg_type, 0) + 1

    print(f"Message type counts: {message_types}")

    # Display errors (which are usually most important)
    for msg in result.console_messages:
        if msg.get("type") == "error":
            print(f"Error: {msg.get('text')}")
```
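To enable this capture in the first place, both flags go on the run config. A minimal sketch:

```python
run_cfg = CrawlerRunConfig(
    capture_network_requests=True,  # populate result.network_requests
    capture_console_messages=True,  # populate result.console_messages
)
```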
## 8. Example: Accessing Everything

```python
async def handle_result(result: CrawlResult):
    if not result.success:
        print("Crawl error:", result.error_message)
        return

    # Basic info
    print("Crawled URL:", result.url)
    print("Status code:", result.status_code)

    # HTML
    print("Original HTML size:", len(result.html))
    print("Cleaned HTML size:", len(result.cleaned_html or ""))

    # Markdown output
    if result.markdown:
        print("Raw Markdown:", result.markdown.raw_markdown[:300])
        print("Citations Markdown:", result.markdown.markdown_with_citations[:300])
        if result.markdown.fit_markdown:
            print("Fit Markdown:", result.markdown.fit_markdown[:200])

    # Media & Links
    if "images" in result.media:
        print("Image count:", len(result.media["images"]))
    if "internal" in result.links:
        print("Internal link count:", len(result.links["internal"]))

    # Extraction strategy result
    if result.extracted_content:
        print("Structured data:", result.extracted_content)

    # Screenshot/PDF/MHTML
    if result.screenshot:
        print("Screenshot length:", len(result.screenshot))
    if result.pdf:
        print("PDF bytes length:", len(result.pdf))
    if result.mhtml:
        print("MHTML length:", len(result.mhtml))

    # Network and console capturing
    if result.network_requests:
        print(f"Network requests captured: {len(result.network_requests)}")
        # Analyze request types
        req_types = {}
        for req in result.network_requests:
            if "resource_type" in req:
                req_types[req["resource_type"]] = req_types.get(req["resource_type"], 0) + 1
        print(f"Resource types: {req_types}")

    if result.console_messages:
        print(f"Console messages captured: {len(result.console_messages)}")
        # Count by message type
        msg_types = {}
        for msg in result.console_messages:
            msg_types[msg.get("type", "unknown")] = msg_types.get(msg.get("type", "unknown"), 0) + 1
        print(f"Message types: {msg_types}")
```

## 9. Key Points & Future

1. **Deprecated legacy properties of CrawlResult**
   - `markdown_v2` – Deprecated in v0.5; use `markdown`, which now holds the `MarkdownGenerationResult`.
   - `fit_markdown` and `fit_html` – Deprecated in v0.5; access them via the `MarkdownGenerationResult` in `result.markdown`, e.g. `result.markdown.fit_markdown` and `result.markdown.fit_html`.
2. **Fit Content**
   - **`fit_markdown`** and **`fit_html`** appear in `MarkdownGenerationResult` only if you used a content filter (like **PruningContentFilter** or **BM25ContentFilter**) inside your **MarkdownGenerationStrategy** or set them directly.
   - If no filter is used, they remain `None`.
3. **References & Citations**
   - If you enable link citations in your `DefaultMarkdownGenerator` (`options={"citations": True}`), you'll see `markdown_with_citations` plus a **`references_markdown`** block. This helps LLM contexts and academic-style referencing.
4. **Links & Media**
   - `links["internal"]` and `links["external"]` group discovered anchors by domain.
   - `media["images"]` / `["videos"]` / `["audios"]` store extracted media elements with optional scoring or context.
5. **Error Cases**
   - If `success=False`, check `error_message` (e.g., timeouts, invalid URLs).
   - `status_code` might be `None` if the crawl failed before an HTTP response.

Use **`CrawlResult`** to glean all final outputs and feed them into your data pipelines, AI models, or archives. With a properly configured **BrowserConfig** and **CrawlerRunConfig**, the crawler produces robust, structured results in **`CrawlResult`**.
# Configuration

# Browser, Crawler & LLM Configuration (Quick Overview)

Crawl4AI's flexibility stems from three key classes:

1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LLMConfig`** – Dictates **how** LLM providers are configured (model, API token, base URL, temperature, etc.).

In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).

## 1. BrowserConfig Essentials

```python
class BrowserConfig:
    def __init__(
        browser_type="chromium",
        headless=True,
        proxy_config=None,
        viewport_width=1080,
        viewport_height=600,
        verbose=True,
        use_persistent_context=False,
        user_data_dir=None,
        cookies=None,
        headers=None,
        user_agent=None,
        text_mode=False,
        light_mode=False,
        extra_args=None,
        enable_stealth=False,
        # ... other advanced parameters omitted here
    ):
        ...
```

### Key Fields to Note

1. **`browser_type`**:
   - Options: `"chromium"`, `"firefox"`, or `"webkit"`.
   - Defaults to `"chromium"`.
   - If you need a different engine, specify it here.
2. **`headless`**:
   - `True`: Runs the browser in headless mode (invisible browser).
   - `False`: Runs the browser in visible mode, which helps with debugging.
3. **`proxy_config`**:
   - A dictionary with fields like:

   ```json
   {
       "server": "http://proxy.example.com:8080",
       "username": "...",
       "password": "..."
   }
   ```

   - Leave as `None` if a proxy is not required.
4. **`viewport_width` & `viewport_height`**:
   - The initial window size.
   - Some sites behave differently with smaller or bigger viewports.
5. **`verbose`**:
   - If `True`, prints extra logs.
   - Handy for debugging.
6. **`use_persistent_context`**:
   - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
   - Typically also set `user_data_dir` to point to a folder.
7. **`cookies`** & **`headers`**:
   - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
8. **`user_agent`**:
   - Custom User-Agent string. If `None`, a default is used.
   - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
9. **`text_mode`** & **`light_mode`**:
   - `text_mode=True` disables images, possibly speeding up text-only crawls.
   - `light_mode=True` turns off certain background features for performance.
10. **`extra_args`**:
    - Additional flags for the underlying browser.
    - E.g. `["--disable-extensions"]`.
11. **`enable_stealth`**:
    - If `True`, enables stealth mode using playwright-stealth.
    - Modifies browser fingerprints to avoid basic bot detection.
    - Default is `False`. Recommended for sites with bot protection.
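Combining a few of these fields (a sketch; the proxy credentials and UA string are placeholders):

```python
from crawl4ai import BrowserConfig

browser_cfg = BrowserConfig(
    browser_type="chromium",
    headless=True,
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass",
    },
    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
    text_mode=True,  # skip images for faster text-only crawls
)
```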
### Helper Methods

Both configuration classes provide a `clone()` method to create modified copies:

```python
# Create a base browser config
base_browser = BrowserConfig(
    browser_type="chromium",
    headless=True,
    text_mode=True
)

# Create a visible browser config for debugging
debug_browser = base_browser.clone(
    headless=False,
    verbose=True
)
```

A minimal end-to-end usage example:

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig

browser_conf = BrowserConfig(
    browser_type="firefox",
    headless=False,
    text_mode=True
)

async with AsyncWebCrawler(config=browser_conf) as crawler:
    result = await crawler.arun("https://example.com")
    print(result.markdown[:300])
```

## 2. CrawlerRunConfig Essentials

```python
class CrawlerRunConfig:
    def __init__(
        word_count_threshold=200,
        extraction_strategy=None,
        markdown_generator=None,
        cache_mode=None,
        js_code=None,
        wait_for=None,
        screenshot=False,
        pdf=False,
        capture_mhtml=False,
        # Location and Identity Parameters
        locale=None,        # e.g. "en-US", "fr-FR"
        timezone_id=None,   # e.g. "America/New_York"
        geolocation=None,   # GeolocationConfig object
        # Resource Management
        enable_rate_limiting=False,
        rate_limit_config=None,
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=20,
        display_mode=None,
        verbose=True,
        stream=False,  # Enable streaming for arun_many()
        # ... other advanced parameters omitted
    ):
        ...
```
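Before walking through each field, here is a sketch of how the resource-management parameters above combine (values are illustrative):

```python
run_cfg = CrawlerRunConfig(
    memory_threshold_percent=70.0,  # slow down above 70% memory usage
    check_interval=1.0,             # sample system resources every second
    max_session_permit=10,          # cap concurrent crawl sessions
)
```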
### Key Fields to Note

1. **`word_count_threshold`**:
   - The minimum word count before a block is considered.
   - If your site has lots of short paragraphs or items, you can lower it.
2. **`extraction_strategy`**:
   - Where you plug in JSON-based extraction (CSS, LLM, etc.).
   - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
3. **`markdown_generator`**:
   - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
   - If `None`, a default approach is used.
4. **`cache_mode`**:
   - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
   - If `None`, defaults to some level of caching, or you can specify `CacheMode.ENABLED`.
5. **`js_code`**:
   - A string or list of JS strings to execute.
   - Great for "Load More" buttons or user interactions.
6. **`wait_for`**:
   - A CSS or JS expression to wait for before extracting content.
   - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
   - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
   - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
8. **Location Parameters**:
   - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences.
   - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`).
   - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`.
9. **`verbose`**:
   - Logs additional runtime details.
   - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
10. **`enable_rate_limiting`**:
    - If `True`, enables rate limiting for batch processing.
    - Requires `rate_limit_config` to be set.
11. **`memory_threshold_percent`**:
    - The memory threshold (as a percentage) to monitor.
    - If exceeded, the crawler will pause or slow down.
12. **`check_interval`**:
    - The interval (in seconds) to check system resources.
    - Affects how often memory and CPU usage are monitored.
13. **`max_session_permit`**:
    - The maximum number of concurrent crawl sessions.
    - Helps prevent overwhelming the system.
14. **`url_matcher`** & **`match_mode`**:
    - Enable URL-specific configurations when used with `arun_many()`.
    - Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
    - Use `match_mode` (OR/AND) to control how multiple patterns combine.
15. **`display_mode`**:
    - The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
    - Affects how much information is printed during the crawl.

### Helper Methods

The `clone()` method is particularly useful for creating variations of your crawler configuration:

```python
# Create a base configuration
base_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    word_count_threshold=200,
    wait_until="networkidle"
)

# Create variations for different use cases
stream_config = base_config.clone(
    stream=True,  # Enable streaming mode
    cache_mode=CacheMode.BYPASS
)

debug_config = base_config.clone(
    page_timeout=120000,  # Longer timeout for debugging
    verbose=True
)
```

The `clone()` method:

- Creates a new instance with all the same settings
- Updates only the specified parameters
- Leaves the original configuration unchanged
- Perfect for creating variations without repeating all parameters

## 3. LLMConfig Essentials

### Key Fields to Note

1. **`provider`**:
   - Which LLM provider to use.
   - Possible values are `"ollama/llama3"`, `"groq/llama3-70b-8192"`, `"groq/llama3-8b-8192"`, `"openai/gpt-4o-mini"`, `"openai/gpt-4o"`, `"openai/o1-mini"`, `"openai/o1-preview"`, `"openai/o3-mini"`, `"openai/o3-mini-high"`, `"anthropic/claude-3-haiku-20240307"`, `"anthropic/claude-3-opus-20240229"`, `"anthropic/claude-3-sonnet-20240229"`, `"anthropic/claude-3-5-sonnet-20240620"`, `"gemini/gemini-pro"`, `"gemini/gemini-1.5-pro"`, `"gemini/gemini-2.0-flash"`, `"gemini/gemini-2.0-flash-exp"`, `"gemini/gemini-2.0-flash-lite-preview-02-05"`, `"deepseek/deepseek-chat"` *(default: `"openai/gpt-4o-mini"`)*.
2. **`api_token`**:
   - Optional. If not provided explicitly, it is read from an environment variable based on the provider; e.g., if a Gemini model is the provider, `GEMINI_API_KEY` is read from the environment.
   - Pass the provider's API token directly, e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`.
   - Or reference an environment variable with the `"env:"` prefix, e.g. `api_token = "env: GROQ_API_KEY"`.
3. **`base_url`**:
   - Custom endpoint, if your provider has one.

```python
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```

## 4. Putting It All Together

In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` objects depending on each call's needs:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
from crawl4ai import JsonCssExtractionStrategy

async def main():
    # 1) Browser config: headless, bigger viewport, no proxy
    browser_conf = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=720
    )

    # 2) Example extraction strategy
    schema = {
        "name": "Articles",
        "baseSelector": "div.article",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text"},
            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
        ]
    }
    extraction = JsonCssExtractionStrategy(schema)

    # 3) Example LLM content filtering
    gemini_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_TOKEN"
    )

    # Initialize LLM filter with a specific instruction
    filter = LLMContentFilter(
        llm_config=gemini_config,  # or your preferred provider
        instruction="""
        Focus on extracting the core educational content.
        Include:
        - Key concepts and explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        Format the output as clean markdown with proper code blocks and headers.
        """,
        chunk_token_threshold=500,  # Adjust based on your needs
        verbose=True
    )

    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True}
    )

    # 4) Crawler run config: skip cache, use extraction
    run_conf = CrawlerRunConfig(
        markdown_generator=md_generator,
        extraction_strategy=extraction,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        # 5) Execute the crawl
        result = await crawler.arun(url="https://example.com/news", config=run_conf)

        if result.success:
            print("Extracted content:", result.extracted_content)
        else:
            print("Error:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())
```

## 5. Next Steps

- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
- **Custom Hooks & Auth** (inject JavaScript or handle login forms).
- **Session Management** (re-use pages, preserve state across multiple calls).
- **Advanced Caching** (fine-tune read/write cache modes).

## 6. Conclusion

In short: configure the browser once with `BrowserConfig`, tune each crawl with `CrawlerRunConfig`, and share LLM settings through `LLMConfig`.


# 1. **BrowserConfig** – Controlling the Browser

`BrowserConfig` focuses on **how** the browser is launched and behaves. This includes headless mode, proxies, user agents, and other environment tweaks.
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig

browser_cfg = BrowserConfig(
    browser_type="chromium",
    headless=True,
    viewport_width=1280,
    viewport_height=720,
    proxy="http://user:pass@proxy:8080",
    user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
)
```

## 1.1 Parameter Highlights

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`browser_type`** | `"chromium"`, `"firefox"`, `"webkit"` *(default: `"chromium"`)* | Which browser engine to use. `"chromium"` is typical for many sites; `"firefox"` or `"webkit"` for specialized tests. |
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keeps cookies/sessions across runs). Also sets `use_managed_browser=True`. |
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
| **`ignore_https_errors`** | `bool` (default: `True`) | If `True`, continues despite invalid certificates (common in dev/staging). |
| **`java_script_enabled`** | `bool` (default: `True`) | Disable if you want no JS overhead, or if only static content is needed. |
| **`cookies`** | `list` (default: `[]`) | Pre-set cookies, each a dict like `{"name": "session", "value": "...", "url": "..."}`. |
| **`headers`** | `dict` (default: `{}`) | Extra HTTP headers for every request, e.g. `{"Accept-Language": "en-US"}`. |
| **`user_agent`** | `str` (default: Chrome-based UA) | Your custom or random user agent. `user_agent_mode="random"` can shuffle it. |
| **`light_mode`** | `bool` (default: `False`) | Disables some background features for performance gains. |
| **`text_mode`** | `bool` (default: `False`) | If `True`, tries to disable images/other heavy content for speed. |
| **`use_managed_browser`** | `bool` (default: `False`) | For advanced "managed" interactions (debugging, CDP usage). Typically set automatically if persistent context is on. |
| **`extra_args`** | `list` (default: `[]`) | Additional flags for the underlying browser process, e.g. `["--disable-extensions"]`. |

**Tips**:

- Set `headless=False` to visually **debug** how pages load or how interactions proceed.
- If you need **authentication** storage or repeated sessions, consider `use_persistent_context=True` and specify `user_data_dir`.
- For large pages, you might need a bigger `viewport_width` and `viewport_height` to handle dynamic content.

# 2. **CrawlerRunConfig** – Controlling Each Crawl

While `BrowserConfig` sets up the **environment**, `CrawlerRunConfig` details **how** each **crawl operation** should behave: caching, content filtering, link or domain blocking, timeouts, JavaScript code, etc.

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

run_cfg = CrawlerRunConfig(
    wait_for="css:.main-content",
    word_count_threshold=15,
    excluded_tags=["nav", "footer"],
    exclude_external_links=True,
    stream=True,  # Enable streaming for arun_many()
)
```

## 2.1 Parameter Highlights

### A) **Content Processing**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: `None`) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (`None`) | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as the `content_source` parameter to select the HTML input source (`'cleaned_html'`, `'raw_html'`, or `'fit_html'`). |
| **`css_selector`** | `str` (`None`) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (`None`) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (`None`) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (`None`) | Like `css_selector` but for exclusion. E.g. `"#ads, .tracker"`. |
| **`only_text`** | `bool` (`False`) | If `True`, tries to extract text-only content. |
| **`prettiify`** | `bool` (`False`) | If `True`, beautifies final HTML (slower, purely cosmetic). |
| **`keep_data_attributes`** | `bool` (`False`) | If `True`, preserves `data-*` attributes in cleaned HTML. |
| **`remove_forms`** | `bool` (`False`) | If `True`, removes all `<form>` elements. |

### B) **Caching & Session**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`cache_mode`** | `CacheMode or None` | Controls how caching is handled (`ENABLED`, `BYPASS`, `DISABLED`, etc.). If `None`, typically defaults to `ENABLED`. |
| **`session_id`** | `str or None` | Assign a unique ID to reuse a single browser session across multiple `arun()` calls. |
| **`bypass_cache`** | `bool` (`False`) | If `True`, acts like `CacheMode.BYPASS`. |
| **`disable_cache`** | `bool` (`False`) | If `True`, acts like `CacheMode.DISABLED`. |
| **`no_cache_read`** | `bool` (`False`) | If `True`, acts like `CacheMode.WRITE_ONLY` (writes cache but never reads). |
| **`no_cache_write`** | `bool` (`False`) | If `True`, acts like `CacheMode.READ_ONLY` (reads cache but never writes). |

### C) **Page Navigation & Timing**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`wait_until`** | `str` (`domcontentloaded`) | Condition for navigation to "complete". Often `"networkidle"` or `"domcontentloaded"`. |
| **`page_timeout`** | `int` (60000 ms) | Timeout for page navigation or JS steps. Increase for slow sites. |
| **`wait_for`** | `str or None` | Wait for a CSS (`"css:selector"`) or JS (`"js:() => bool"`) condition before content extraction. |
| **`wait_for_images`** | `bool` (`False`) | Wait for images to load before finishing. Slows down if you only want text. |
| **`delay_before_return_html`** | `float` (0.1) | Additional pause (seconds) before final HTML is captured. Good for last-second updates. |
| **`check_robots_txt`** | `bool` (`False`) | Whether to check and respect robots.txt rules before crawling. If `True`, caches robots.txt for efficiency. |
| **`mean_delay`** and **`max_range`** | `float` (0.1, 0.3) | If you call `arun_many()`, these define random delay intervals between crawls, helping avoid detection or rate limits. |
| **`semaphore_count`** | `int` (5) | Max concurrency for `arun_many()`. Increase if you have resources for parallel crawls. |

### D) **Page Interaction**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`js_code`** | `str or list[str]` (`None`) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_only`** | `bool` (`False`) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
| **`ignore_body_visibility`** | `bool` (`True`) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
| **`scan_full_page`** | `bool` (`False`) | If `True`, auto-scrolls the page to load dynamic content (infinite scroll). |
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
| **`process_iframes`** | `bool` (`False`) | Inlines iframe content for single-page extraction. |
| **`remove_overlay_elements`** | `bool` (`False`) | Removes potential modals/popups blocking the main content. |
| **`simulate_user`** | `bool` (`False`) | Simulates user interactions (mouse movements) to avoid bot detection. |
| **`override_navigator`** | `bool` (`False`) | Overrides `navigator` properties in JS for stealth. |
| **`magic`** | `bool` (`False`) | Automatic handling of popups/consent banners. Experimental. |
| **`adjust_viewport_to_content`** | `bool` (`False`) | Resizes viewport to match page content height. |

If your page is a single-page app with repeated JS updates, set `js_only=True` in subsequent calls, plus a `session_id` for reusing the same tab.

### E) **Media Handling**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`screenshot`** | `bool` (`False`) | Captures a screenshot (base64) in `result.screenshot`. |
| **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. |
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
| **`pdf`** | `bool` (`False`) | If `True`, returns a PDF in `result.pdf`. |
| **`capture_mhtml`** | `bool` (`False`) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image's alt text or description to be considered valid. |
| **`image_score_threshold`** | `int` (~3) | Filters out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
| **`exclude_external_images`** | `bool` (`False`) | Excludes images from other domains. |

### F) **Link/Domain Handling**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`exclude_social_media_domains`** | `list` (defaults include Facebook/Twitter) | A default list that can be extended. Any link to these domains is removed from the final output. |
| **`exclude_external_links`** | `bool` (`False`) | Removes all links pointing outside the current domain. |
| **`exclude_social_media_links`** | `bool` (`False`) | Strips links specifically to social sites (like Facebook or Twitter). |
| **`exclude_domains`** | `list` (`[]`) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
| **`preserve_https_for_internal_links`** | `bool` (`False`) | If `True`, preserves the HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |

### G) **Debug & Logging**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`verbose`** | `bool` (`True`) | Prints logs detailing each step of crawling, interactions, or errors. |
| **`log_console`** | `bool` (`False`) | Logs the page's JavaScript console output if you want deeper JS debugging. |

### H) **Virtual Scroll Configuration**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`virtual_scroll_config`** | `VirtualScrollConfig or dict` (`None`) | Configuration for handling virtualized scrolling on sites like Twitter/Instagram where content is replaced rather than appended. |

When sites use virtual scrolling (content replaced as you scroll), use `VirtualScrollConfig`:

```python
from crawl4ai import VirtualScrollConfig

virtual_config = VirtualScrollConfig(
    container_selector="#timeline",  # CSS selector for scrollable container
    scroll_count=30,                 # Number of times to scroll
    scroll_by="container_height",    # How much to scroll: "container_height", "page_height", or pixels (e.g. 500)
    wait_after_scroll=0.5            # Seconds to wait after each scroll for content to load
)

config = CrawlerRunConfig(
    virtual_scroll_config=virtual_config
)
```

**VirtualScrollConfig Parameters:**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`container_selector`** | `str` (required) | CSS selector for the scrollable container (e.g., `"#feed"`, `".timeline"`) |
| **`scroll_count`** | `int` (10) | Maximum number of scrolls to perform |
| **`scroll_by`** | `str or int` (`"container_height"`) | Scroll amount: `"container_height"`, `"page_height"`, or pixels (e.g., `500`) |
| **`wait_after_scroll`** | `float` (0.5) | Time in seconds to wait after each scroll for new content to load |

- Use `virtual_scroll_config` when content is **replaced** during scroll (Twitter, Instagram).
- Use `scan_full_page` when content is **appended** during scroll (traditional infinite scroll).

### I) **URL Matching Configuration**

| **Parameter** | **Type / Default** | **What It Does** |
|---------------|--------------------|------------------|
| **`url_matcher`** | `UrlMatcher` (`None`) | Pattern(s) to match URLs against. Can be a string (glob), a function, or a list of mixed types. **None means match ALL URLs.** |
| **`match_mode`** | `MatchMode` (`MatchMode.OR`) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |

The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:

```python
from crawl4ai import CrawlerRunConfig, MatchMode
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Simple string pattern (glob-style)
pdf_config = CrawlerRunConfig(
    url_matcher="*.pdf",
    scraping_strategy=PDFContentScrapingStrategy()
)

# Multiple patterns with OR logic (default)
blog_config = CrawlerRunConfig(
    url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
    match_mode=MatchMode.OR  # Any pattern matches
)

# Function matcher
api_config = CrawlerRunConfig(
    url_matcher=lambda url: 'api' in url or url.endswith('.json'),
    # Other settings like extraction_strategy
)

# Mixed: String + Function with AND logic
complex_config = CrawlerRunConfig(
    url_matcher=[
        lambda url: url.startswith('https://'),  # Must be HTTPS
        "*.org/*",                               # Must be a .org domain
        lambda url: 'docs' in url                # Must contain 'docs'
    ],
    match_mode=MatchMode.AND  # ALL conditions must match
)

# Combined patterns and functions with AND logic
secure_docs = CrawlerRunConfig(
    url_matcher=["https://*", lambda url: '.doc' in url],
    match_mode=MatchMode.AND  # Must be HTTPS AND contain .doc
)

# Default config - matches ALL URLs
default_config = CrawlerRunConfig()  # No url_matcher = matches everything
```

**UrlMatcher Types:**

- **None (default)**: When `url_matcher` is `None` or not set, the config matches ALL URLs.
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`.
- **Functions**: `lambda url: bool` - custom logic for complex matching.
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`.

**Important Behavior:**

- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found".

## 2.2 Helper Methods

Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:

```python
# Create a base configuration
base_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    word_count_threshold=200
)

# Create variations using clone()
stream_config = base_config.clone(stream=True)
no_cache_config = base_config.clone(
    cache_mode=CacheMode.BYPASS,
    stream=True
)
```

The `clone()` method is particularly useful when you need slightly different configurations for different use cases, without modifying the original config.
+## 2.3 Example Usage +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + # Configure the browser + browser_cfg = BrowserConfig( + headless=False, + viewport_width=1280, + viewport_height=720, + proxy="http://user:pass@myproxy:8080", + text_mode=True + ) + + # Configure the run + run_cfg = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + session_id="my_session", + css_selector="main.article", + excluded_tags=["script", "style"], + exclude_external_links=True, + wait_for="css:.article-loaded", + screenshot=True, + stream=True + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + result = await crawler.arun( + url="https://example.com/news", + config=run_cfg + ) + if result.success: + print("Final cleaned_html length:", len(result.cleaned_html)) + if result.screenshot: + print("Screenshot captured (base64, length):", len(result.screenshot)) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` +## 2.4 Compliance & Ethics +| **Parameter** | **Type / Default** | **What It Does** | +|-----------------------|-------------------------|----------------------------------------------------------------------------------------------------------------------| +| **`check_robots_txt`**| `bool` (False) | When True, checks and respects robots.txt rules before crawling. Uses efficient caching with SQLite backend. | +| **`user_agent`** | `str` (None) | User agent string to identify your crawler. Used for robots.txt checking when enabled. | +```python +run_config = CrawlerRunConfig( + check_robots_txt=True, # Enable robots.txt compliance + user_agent="MyBot/1.0" # Identify your crawler +) +``` +# 3. **LLMConfig** - Setting up LLM providers +1. LLMExtractionStrategy +2. LLMContentFilter +3. JsonCssExtractionStrategy.generate_schema +4. JsonXPathExtractionStrategy.generate_schema +## 3.1 Parameters +| **Parameter** | **Type / Default** | **What It Does** | +|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------| +| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use. |
+| **`api_token`** | Optional. Three forms are accepted:
1. Not provided: the token is read from the environment variable that matches the provider (e.g., `GEMINI_API_KEY` when a Gemini model is the provider).
2. A literal API token for the provider,
eg: `api_token = "<your-provider-api-key>"`
3. A reference to an environment variable, using the `"env:"` prefix,
eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider +| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint +## 3.2 Example Usage +```python +llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) +``` +## 4. Putting It All Together +- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent. +- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS. +- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`). +- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema` +```python +# Create a modified copy with the clone() method +stream_cfg = run_cfg.clone( + stream=True, + cache_mode=CacheMode.BYPASS +) +``` + + + +# Crawling Patterns + +# Simple Crawling +## Basic Usage +Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`: +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig() # Default browser configuration + run_config = CrawlerRunConfig() # Default crawl run configuration + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.markdown) # Print clean markdown content + +if __name__ == "__main__": + asyncio.run(main()) +``` +## Understanding the Response +The `arun()` method returns a `CrawlResult` object with several useful properties. 
Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details): +```python +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) +) + +result = await crawler.arun( + url="https://example.com", + config=config +) + +# Different content formats +print(result.html) # Raw HTML +print(result.cleaned_html) # Cleaned HTML +print(result.markdown.raw_markdown) # Raw markdown from cleaned html +print(result.markdown.fit_markdown) # Most relevant content in markdown + +# Check success status +print(result.success) # True if crawl succeeded +print(result.status_code) # HTTP status code (e.g., 200, 404) + +# Access extracted media and links +print(result.media) # Dictionary of found media (images, videos, audio) +print(result.links) # Dictionary of internal and external links +``` +## Adding Basic Options +Customize your crawl using `CrawlerRunConfig`: +```python +run_config = CrawlerRunConfig( + word_count_threshold=10, # Minimum words per content block + exclude_external_links=True, # Remove external links + remove_overlay_elements=True, # Remove popups/modals + process_iframes=True # Process iframe content +) + +result = await crawler.arun( + url="https://example.com", + config=run_config +) +``` +## Handling Errors +```python +run_config = CrawlerRunConfig() +result = await crawler.arun(url="https://example.com", config=run_config) + +if not result.success: + print(f"Crawl failed: {result.error_message}") + print(f"Status code: {result.status_code}") +``` +## Logging and Debugging +Enable verbose logging in `BrowserConfig`: +```python +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + run_config = CrawlerRunConfig() + result = await crawler.arun(url="https://example.com", config=run_config) +``` +## Complete Example +```python +import asyncio +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + browser_config = BrowserConfig(verbose=True) + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED # Use cache if available + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + + if result.success: + # Print clean content + print("Content:", result.markdown[:500]) # First 500 chars + + # Process images + for image in result.media["images"]: + print(f"Found image: {image['src']}") + + # Process links + for link in result.links["internal"]: + print(f"Internal link: {link['href']}") + + else: + print(f"Crawl failed: {result.error_message}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + + + +# Content Processing + +# Markdown Generation Basics +1. How to configure the **Default Markdown Generator** +3. The difference between raw markdown (`result.markdown`) and filtered markdown (`fit_markdown`) +> - You know how to configure `CrawlerRunConfig`. +## 1. 
Quick Example
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+    config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator()
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+
+        if result.success:
+            print("Raw Markdown Output:\n")
+            print(result.markdown)  # The unfiltered markdown from the page
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- `CrawlerRunConfig( markdown_generator = DefaultMarkdownGenerator() )` instructs Crawl4AI to convert the final HTML into markdown at the end of each crawl.
+- The resulting markdown is accessible via `result.markdown`.
+## 2. How Markdown Generation Works
+### 2.1 HTML-to-Text Conversion (Forked & Modified)
+- Preserves headings, code blocks, bullet points, etc.
+- Removes extraneous tags (scripts, styles) that don’t add meaningful content.
+- Can optionally generate references for links or skip them altogether.
+### 2.2 Link Citations & References
+By default, the generator can convert `<a>` elements into `[text][1]` citations, then place the actual links at the bottom of the document. This is handy for research workflows that demand references in a structured manner.
+### 2.3 Optional Content Filters
+## 3. Configuring the Default Markdown Generator
+You can tweak the output by passing an `options` dict to `DefaultMarkdownGenerator`. For example:
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    # Example: ignore all links, don't escape HTML, and wrap text at 80 characters
+    md_generator = DefaultMarkdownGenerator(
+        options={
+            "ignore_links": True,
+            "escape_html": False,
+            "body_width": 80
+        }
+    )
+
+    config = CrawlerRunConfig(
+        markdown_generator=md_generator
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com/docs", config=config)
+        if result.success:
+            print("Markdown:\n", result.markdown[:500])  # Just a snippet
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(main())
+```
+Some commonly used `options`:
+- **`ignore_links`** (bool): Whether to remove all hyperlinks in the final markdown.
+- **`ignore_images`** (bool): Remove all `![image]()` references.
+- **`escape_html`** (bool): Turn HTML entities into text (default is often `True`).
+- **`body_width`** (int): Wrap text at N characters. `0` or `None` means no wrapping.
+- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page.
+- **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way.
+## 4. Selecting the HTML Source for Markdown Generation
+The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown.
+```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + # Option 1: Use the raw HTML directly from the webpage (before any processing) + raw_md_generator = DefaultMarkdownGenerator( + content_source="raw_html", + options={"ignore_links": True} + ) + + # Option 2: Use the cleaned HTML (after scraping strategy processing - default) + cleaned_md_generator = DefaultMarkdownGenerator( + content_source="cleaned_html", # This is the default + options={"ignore_links": True} + ) + + # Option 3: Use preprocessed HTML optimized for schema extraction + fit_md_generator = DefaultMarkdownGenerator( + content_source="fit_html", + options={"ignore_links": True} + ) + + # Use one of the generators in your crawler config + config = CrawlerRunConfig( + markdown_generator=raw_md_generator # Try each of the generators + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + if result.success: + print("Markdown:\n", result.markdown.raw_markdown[:500]) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + import asyncio + asyncio.run(main()) +``` +### HTML Source Options +- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed. +- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content. +- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed. +### When to Use Each Option +- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal. +- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep. +- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction. +## 5. Content Filters +### 5.1 BM25ContentFilter +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai import CrawlerRunConfig + +bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + language="english" +) + +md_generator = DefaultMarkdownGenerator( + content_filter=bm25_filter, + options={"ignore_links": True} +) + +config = CrawlerRunConfig(markdown_generator=md_generator) +``` +- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query. +- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more. +- **`use_stemming`** *(default `True`)*: Whether to apply stemming to the query and content. +- **`language (str)`**: Language for stemming (default: 'english'). +### 5.2 PruningContentFilter +If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections. 
+```python +from crawl4ai.content_filter_strategy import PruningContentFilter + +prune_filter = PruningContentFilter( + threshold=0.5, + threshold_type="fixed", # or "dynamic" + min_word_threshold=50 +) +``` +- **`threshold`**: Score boundary. Blocks below this score get removed. +- **`threshold_type`**: + - `"fixed"`: Straight comparison (`score >= threshold` keeps the block). + - `"dynamic"`: The filter adjusts threshold in a data-driven manner. +- **`min_word_threshold`**: Discard blocks under N words as likely too short or unhelpful. +- You want a broad cleanup without a user query. +### 5.3 LLMContentFilter +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import LLMContentFilter + +async def main(): + # Initialize LLM filter with specific instruction + filter = LLMContentFilter( + llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable + instruction=""" + Focus on extracting the core educational content. + Include: + - Key concepts and explanations + - Important code examples + - Essential technical details + Exclude: + - Navigation elements + - Sidebars + - Footer content + Format the output as clean markdown with proper code blocks and headers. + """, + chunk_token_threshold=4096, # Adjust based on your needs + verbose=True + ) + md_generator = DefaultMarkdownGenerator( + content_filter=filter, + options={"ignore_links": True} + ) + config = CrawlerRunConfig( + markdown_generator=md_generator, + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + print(result.markdown.fit_markdown) # Filtered markdown content +``` +- **Chunk Processing**: Handles large documents by processing them in chunks (controlled by `chunk_token_threshold`) +- **Parallel Processing**: For better performance, use smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks +1. **Exact Content Preservation**: +```python +filter = LLMContentFilter( + instruction=""" + Extract the main educational content while preserving its original wording and substance completely. + 1. Maintain the exact language and terminology + 2. Keep all technical explanations and examples intact + 3. Preserve the original flow and structure + 4. Remove only clearly irrelevant elements like navigation menus and ads + """, + chunk_token_threshold=4096 +) +``` +2. **Focused Content Extraction**: +```python +filter = LLMContentFilter( + instruction=""" + Focus on extracting specific types of content: + - Technical documentation + - Code examples + - API references + Reformat the content into clear, well-structured markdown + """, + chunk_token_threshold=4096 +) +``` +> **Performance Tip**: Set a smaller `chunk_token_threshold` (e.g., 2048 or 4096) to enable parallel processing of content chunks. The default value is infinity, which processes the entire content as a single chunk. +## 6. Using Fit Markdown +When a content filter is active, the library produces two forms of markdown inside `result.markdown`: +1. **`raw_markdown`**: The full unfiltered markdown. +2. **`fit_markdown`**: A “fit” version where the filter has removed or trimmed noisy segments. 
+```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +async def main(): + config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://news.example.com/tech", config=config) + if result.success: + print("Raw markdown:\n", result.markdown) + + # If a filter is used, we also have .fit_markdown: + md_object = result.markdown # or your equivalent + print("Filtered markdown:\n", md_object.fit_markdown) + else: + print("Crawl failed:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` +## 7. The `MarkdownGenerationResult` Object +If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as: +- **`raw_markdown`**: The direct HTML-to-markdown transformation (no filtering). +- **`markdown_with_citations`**: A version that moves links to reference-style footnotes. +- **`references_markdown`**: A separate string or section containing the gathered references. +- **`fit_markdown`**: The filtered markdown if you used a content filter. +- **`fit_html`**: The corresponding HTML snippet used to generate `fit_markdown` (helpful for debugging or advanced usage). +```python +md_obj = result.markdown # your library’s naming may vary +print("RAW:\n", md_obj.raw_markdown) +print("CITED:\n", md_obj.markdown_with_citations) +print("REFERENCES:\n", md_obj.references_markdown) +print("FIT:\n", md_obj.fit_markdown) +``` +- You can supply `raw_markdown` to an LLM if you want the entire text. +- Or feed `fit_markdown` into a vector database to reduce token usage. +- `references_markdown` can help you keep track of link provenance. +## 8. Combining Filters (BM25 + Pruning) in Two Passes +You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead: +1. **First pass**: Apply `PruningContentFilter` directly to the raw HTML from `result.html` (the crawler’s downloaded HTML). +2. **Second pass**: Take the pruned HTML (or text) from step 1, and feed it into `BM25ContentFilter`, focusing on a user query. +### Two-Pass Example +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter +from bs4 import BeautifulSoup + +async def main(): + # 1. Crawl with minimal or no markdown generator, just get raw HTML + config = CrawlerRunConfig( + # If you only want raw HTML, you can skip passing a markdown_generator + # or provide one but focus on .html in this example + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/tech-article", config=config) + + if not result.success or not result.html: + print("Crawl failed or no HTML content.") + return + + raw_html = result.html + + # 2. 
First pass: PruningContentFilter on raw HTML + pruning_filter = PruningContentFilter(threshold=0.5, min_word_threshold=50) + + # filter_content returns a list of "text chunks" or cleaned HTML sections + pruned_chunks = pruning_filter.filter_content(raw_html) + # This list is basically pruned content blocks, presumably in HTML or text form + + # For demonstration, let's combine these chunks back into a single HTML-like string + # or you could do further processing. It's up to your pipeline design. + pruned_html = "\n".join(pruned_chunks) + + # 3. Second pass: BM25ContentFilter with a user query + bm25_filter = BM25ContentFilter( + user_query="machine learning", + bm25_threshold=1.2, + language="english" + ) + + # returns a list of text chunks + bm25_chunks = bm25_filter.filter_content(pruned_html) + + if not bm25_chunks: + print("Nothing matched the BM25 query after pruning.") + return + + # 4. Combine or display final results + final_text = "\n---\n".join(bm25_chunks) + + print("==== PRUNED OUTPUT (first pass) ====") + print(pruned_html[:500], "... (truncated)") # preview + + print("\n==== BM25 OUTPUT (second pass) ====") + print(final_text[:500], "... (truncated)") + +if __name__ == "__main__": + asyncio.run(main()) +``` +### What’s Happening? +1. **Raw HTML**: We crawl once and store the raw HTML in `result.html`. +4. **BM25ContentFilter**: We feed the pruned string into `BM25ContentFilter` with a user query. This second pass further narrows the content to chunks relevant to “machine learning.” +**No Re-Crawling**: We used `raw_html` from the first pass, so there’s no need to run `arun()` again—**no second network request**. +### Tips & Variations +- **Plain Text vs. HTML**: If your pruned output is mostly text, BM25 can still handle it; just keep in mind it expects a valid string input. If you supply partial HTML (like `"
<div>some text</div>
"`), it will parse it as HTML. +- **Adjust Thresholds**: If you see too much or too little text in step one, tweak `threshold=0.5` or `min_word_threshold=50`. Similarly, `bm25_threshold=1.2` can be raised/lowered for more or fewer chunks in step two. +### One-Pass Combination? +## 9. Common Pitfalls & Tips +1. **No Markdown Output?** +2. **Performance Considerations** + - Very large pages with multiple filters can be slower. Consider `cache_mode` to avoid re-downloading. +3. **Take Advantage of `fit_markdown`** +4. **Adjusting `html2text` Options** + - If you see lots of raw HTML slipping into the text, turn on `escape_html`. + - If code blocks look messy, experiment with `mark_code` or `handle_code_in_pre`. +## 10. Summary & Next Steps +- Configure the **DefaultMarkdownGenerator** with HTML-to-text options. +- Select different HTML sources using the `content_source` parameter. +- Distinguish between raw and filtered markdown (`fit_markdown`). +- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.). + + +# Fit Markdown with Pruning & BM25 +## 1. How “Fit Markdown” Works +### 1.1 The `content_filter` +In **`CrawlerRunConfig`**, you can specify a **`content_filter`** to shape how content is pruned or ranked before final markdown generation. A filter’s logic is applied **before** or **during** the HTML→Markdown process, producing: +- **`result.markdown.raw_markdown`** (unfiltered) +- **`result.markdown.fit_markdown`** (filtered or “fit” version) +- **`result.markdown.fit_html`** (the corresponding HTML snippet that produced `fit_markdown`) +### 1.2 Common Filters +## 2. PruningContentFilter +### 2.1 Usage Example +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import PruningContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + # Step 1: Create a pruning filter + prune_filter = PruningContentFilter( + # Lower → more content retained, higher → more content pruned + threshold=0.45, + # "fixed" or "dynamic" + threshold_type="dynamic", + # Ignore nodes with <5 words + min_word_threshold=5 + ) + + # Step 2: Insert it into a Markdown Generator + md_generator = DefaultMarkdownGenerator(content_filter=prune_filter) + + # Step 3: Pass it to CrawlerRunConfig + config = CrawlerRunConfig( + markdown_generator=md_generator + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com", + config=config + ) + + if result.success: + # 'fit_markdown' is your pruned content, focusing on "denser" text + print("Raw Markdown length:", len(result.markdown.raw_markdown)) + print("Fit Markdown length:", len(result.markdown.fit_markdown)) + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` +### 2.2 Key Parameters +- **`min_word_threshold`** (int): If a block has fewer words than this, it’s pruned. +- **`threshold_type`** (str): + - `"fixed"` → each node must exceed `threshold` (0–1). + - `"dynamic"` → node scoring adjusts according to tag type, text/link density, etc. +- **`threshold`** (float, default ~0.48): The base or “anchor” cutoff. +- **Link density** – Penalizes sections that are mostly links. +- **Tag importance** – e.g., an `
<article>` or `<p>` might be more important than a `<div>
`. +## 3. BM25ContentFilter +### 3.1 Usage Example +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.content_filter_strategy import BM25ContentFilter +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator + +async def main(): + # 1) A BM25 filter with a user query + bm25_filter = BM25ContentFilter( + user_query="startup fundraising tips", + # Adjust for stricter or looser results + bm25_threshold=1.2 + ) + + # 2) Insert into a Markdown Generator + md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter) + + # 3) Pass to crawler config + config = CrawlerRunConfig( + markdown_generator=md_generator + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://news.ycombinator.com", + config=config + ) + if result.success: + print("Fit Markdown (BM25 query-based):") + print(result.markdown.fit_markdown) + else: + print("Error:", result.error_message) + +if __name__ == "__main__": + asyncio.run(main()) +``` +### 3.2 Parameters +- **`user_query`** (str, optional): E.g. `"machine learning"`. If blank, the filter tries to glean a query from page metadata. +- **`bm25_threshold`** (float, default 1.0): + - Higher → fewer chunks but more relevant. + - Lower → more inclusive. +> In more advanced scenarios, you might see parameters like `language`, `case_sensitive`, or `priority_tags` to refine how text is tokenized or weighted. +## 4. Accessing the “Fit” Output +After the crawl, your “fit” content is found in **`result.markdown.fit_markdown`**. +```python +fit_md = result.markdown.fit_markdown +fit_html = result.markdown.fit_html +``` +If the content filter is **BM25**, you might see additional logic or references in `fit_markdown` that highlight relevant segments. If it’s **Pruning**, the text is typically well-cleaned but not necessarily matched to a query. +## 5. Code Patterns Recap +### 5.1 Pruning +```python +prune_filter = PruningContentFilter( + threshold=0.5, + threshold_type="fixed", + min_word_threshold=10 +) +md_generator = DefaultMarkdownGenerator(content_filter=prune_filter) +config = CrawlerRunConfig(markdown_generator=md_generator) +``` +### 5.2 BM25 +```python +bm25_filter = BM25ContentFilter( + user_query="health benefits fruit", + bm25_threshold=1.2 +) +md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter) +config = CrawlerRunConfig(markdown_generator=md_generator) +``` +## 6. Combining with “word_count_threshold” & Exclusions +```python +config = CrawlerRunConfig( + word_count_threshold=10, + excluded_tags=["nav", "footer", "header"], + exclude_external_links=True, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.5) + ) +) +``` +1. The crawler’s `excluded_tags` are removed from the HTML first. +3. The final “fit” content is generated in `result.markdown.fit_markdown`. +## 7. Custom Filters +If you need a different approach (like a specialized ML model or site-specific heuristics), you can create a new class inheriting from `RelevantContentFilter` and implement `filter_content(html)`. Then inject it into your **markdown generator**: +```python +from crawl4ai.content_filter_strategy import RelevantContentFilter + +class MyCustomFilter(RelevantContentFilter): + def filter_content(self, html, min_word_threshold=None): + # parse HTML, implement custom logic + return [block for block in ... if ... some condition...] + +``` +1. Subclass `RelevantContentFilter`. +2. Implement `filter_content(...)`. +3. 
Use it in your `DefaultMarkdownGenerator(content_filter=MyCustomFilter(...))`.
+## 8. Final Thoughts
+- **Summaries**: Quickly get the important text from a cluttered page.
+- **Search**: Combine with **BM25** to produce content relevant to a query.
+- **BM25ContentFilter**: Perfect for query-based extraction or searching.
+- Combine with **`excluded_tags`, `exclude_external_links`, `word_count_threshold`** to refine your final “fit” text.
+- Fit markdown ends up in **`result.markdown.fit_markdown`**.
+- Last Updated: 2025-01-01
+
+
+# Content Selection
+Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters.
+## 1. CSS-Based Selection
+There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
+### 1.1 Using `css_selector`
+A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        # e.g., first 30 items from Hacker News
+        css_selector=".athing:nth-child(-n+30)"
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://news.ycombinator.com/newest",
+            config=config
+        )
+        print("Partial HTML length:", len(result.cleaned_html))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+**Result**: Only elements matching that selector remain in `result.cleaned_html`.
+### 1.2 Using `target_elements`
+The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        # Target article body and sidebar, but not other content
+        target_elements=["article.main-content", "aside.sidebar"]
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com/blog-post",
+            config=config
+        )
+        print("Markdown focused on target elements")
+        print("Links from entire page still available:", len(result.links.get("internal", [])))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
+## 2. Content Filtering & Exclusions
+### 2.1 Basic Overview
+```python
+config = CrawlerRunConfig(
+    # Content thresholds
+    word_count_threshold=10,        # Minimum words per block
+
+    # Tag exclusions
+    excluded_tags=['form', 'header', 'footer', 'nav'],
+
+    # Link filtering
+    exclude_external_links=True,
+    exclude_social_media_links=True,
+    # Block entire domains
+    exclude_domains=["adtrackers.com", "spammynews.org"],
+    exclude_social_media_domains=["facebook.com", "twitter.com"],
+
+    # Media filtering
+    exclude_external_images=True
+)
+```
+- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers.
+- **`excluded_tags`**: Removes entire tags (`<form>`, `<header>`, `<footer>`, `<nav>`).
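+To see these exclusions take effect end to end, here is a minimal sketch (the target URL is illustrative; with `exclude_external_links=True`, the `external` bucket of `result.links` should come back empty):
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        word_count_threshold=10,
+        excluded_tags=['form', 'header', 'footer', 'nav'],
+        exclude_external_links=True,
+        exclude_external_images=True
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        if result.success:
+            # exclude_external_links=True means the "external" bucket stays empty
+            print("Internal links:", len(result.links.get("internal", [])))
+            print("External links:", len(result.links.get("external", [])))
+        else:
+            print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```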